fixed get stocks functions
[sgn.git] / lib / CXGN / Dataset.pm
blobfee3e0f4d36bc5a797137d4d4ef8a9def29b5a08
2 =head1 NAME
4 CXGN::Dataset - a class to easily query the database for breeding data
6 =head1 DESCRIPTION
8 CXGN::Dataset can be used to flexibly define datasets for breeding applications. For example, a dataset can be defined using a list of germplasm, a list of trials, a list of years, etc, or a combination of the above. Once defined, it allows to easily obtain related phenotypes and genotypes and other data.
10 Datasets can be stored in the database and retrieved for later use.
12 Currently, there are three incarnations of CXGN::Dataset:
14 =over 5
16 =item CXGN::Dataset
18 Unbuffered output of the queries
20 =item CXGN::Dataset::File
22 Writes results to files
24 =item CXGN::Dataset::Cache
26 Returns output like CXGN::Dataset, but uses a disk-cache for the response data
28 =back
30 =head1 SYNOPSYS
32 my $ds = CXGN::Dataset->new( people_schema => $p, schema => $s);
33 $ds->accessions([ 'a', 'b', 'c' ]);
34 my $trials = $ds->retrieve_trials();
35 my $sp_dataset_id = $ds->store();
36 #...
37 my $restored_ds = CXGN::Dataset( people_schema => $p, schema => $s, sp_dataset_id => $sp_dataset_id );
38 my $years = $restored_ds->retrieve_years();
39 #...
41 =head1 AUTHOR
43 Lukas Mueller <lam87@cornell.edu>
46 =head1 ACCESSORS
48 =cut
51 package CXGN::Dataset;
53 use Moose;
54 use Moose::Util::TypeConstraints;
55 use Data::Dumper;
56 use JSON::Any;
57 use CXGN::BreederSearch;
58 use CXGN::People::Schema;
59 use CXGN::Phenotypes::PhenotypeMatrix;
60 use CXGN::Genotype::Search;
62 =head2 people_schema()
64 accessor for CXGN::People::Schema database object
66 =cut
68 has 'people_schema' => (isa => 'CXGN::People::Schema', is => 'rw', required => 1 );
70 =head2 schema()
72 accessor for Bio::Chado::Schema database object
74 =cut
76 has 'schema' => ( isa => "Bio::Chado::Schema", is => 'rw', required => 1 );
78 =head2 sp_dataset_id()
80 accessor for sp_dataset primary key
82 =cut
85 has 'sp_dataset_id' => ( isa => 'Int',
86 is => 'rw',
87 predicate => 'has_sp_dataset_id',
90 =head2 data()
92 accessor for the json-formatted data structure (as used for the backend storage)
94 =cut
96 has 'data' => ( isa => 'HashRef',
97 is => 'rw'
100 =head2 name()
102 accessor for the name of this dataset
104 =cut
106 has 'name' => ( isa => 'Maybe[Str]',
107 is => 'rw',
110 =head2 description()
112 accessor for the descrition of this dataset
114 =cut
116 has 'description' => ( isa => 'Maybe[Str]',
117 is => 'rw'
120 =head2 sp_person_id()
122 accessor for sp_person_id (owner of the dataset)
124 =cut
126 has 'sp_person_id' => ( isa => 'Maybe[Int]',
127 is => 'rw',
131 =head2 accessions()
133 accessor for defining the accessions that are part of this dataset (ArrayRef).
135 =cut
137 has 'accessions' => ( isa => 'Maybe[ArrayRef]',
138 is => 'rw',
139 predicate => 'has_accessions',
142 =head2 plots()
144 accessor for defining the plots that are part of this dataset (ArrayRef).
146 =cut
148 has 'plots' => ( isa => 'Maybe[ArrayRef]',
149 is => 'rw',
150 predicate => 'has_plots',
153 =head2 plants()
155 accessor for defining the plants that are part of this dataset (ArrayRef).
157 =cut
159 has 'plants' => ( isa => 'Maybe[ArrayRef]',
160 is => 'rw',
161 predicate => 'has_plants',
166 =head2 trials()
168 accessor for defining the trials that are part of this dataset (ArrayRef).
170 =cut
173 has 'trials' => ( isa => 'Maybe[ArrayRef]',
174 is => 'rw',
175 predicate => 'has_trials',
179 =head2 traits()
181 =cut
183 has 'traits' => ( isa => 'Maybe[ArrayRef]',
184 is => 'rw',
185 predicate => 'has_traits',
188 =head2 years()
190 =cut
193 has 'years' => ( isa => 'Maybe[ArrayRef]',
194 is => 'rw',
195 predicate => 'has_years',
198 =head2 breeding_programs()
200 =cut
202 has 'breeding_programs' => ( isa => 'Maybe[ArrayRef]',
203 is => 'rw',
204 predicate => 'has_breeding_programs',
205 default => sub { [] },
208 =head2 genotyping_protocols()
210 =cut
212 has 'genotyping_protocols' => ( isa => 'Maybe[ArrayRef]',
213 is => 'rw',
214 predicate => 'has_genotyping_protocols',
217 =head2 trial_types()
219 =cut
221 has 'trial_types' => ( isa => 'Maybe[ArrayRef]',
222 is => 'rw',
223 predicate => 'has_trial_types',
226 =head2 trial_designs()
228 =cut
230 has 'trial_designs' => ( isa => 'Maybe[ArrayRef]',
231 is => 'rw',
232 predicate => 'has_trial_designs',
235 =head2 locations()
237 =cut
239 has 'locations' => ( isa => 'Maybe[ArrayRef]',
240 is => 'rw',
241 predicate => 'has_locations',
245 has 'category_order' => ( isa => 'Maybe[ArrayRef]',
246 is => 'rw',
247 predicate => 'has_category_order',
254 has 'is_live' => ( isa => 'Bool',
255 is => 'rw',
256 default => 0,
260 =head2 data_level()
262 =cut
264 has 'data_level' => ( isa => 'String',
265 is => 'rw',
266 isa => enum([qw[ plot plant subplot ]]),
267 default => 'plot',
270 =head2 exclude_phenotype_outlier()
272 =cut
274 has 'exclude_phenotype_outlier' => (
275 isa => 'Bool',
276 is => 'ro',
277 default => 0
280 has 'breeder_search' => (isa => 'CXGN::BreederSearch', is => 'rw');
283 sub BUILD {
284 my $self = shift;
286 my $bs = CXGN::BreederSearch->new(dbh => $self->schema->storage->dbh());
287 $self->breeder_search($bs);
289 if ($self->has_sp_dataset_id()) {
290 print STDERR "Processing dataset_id ".$self->sp_dataset_id()."\n";
291 my $row = $self->people_schema()->resultset("SpDataset")->find({ sp_dataset_id => $self->sp_dataset_id() });
292 if (!$row) { die "The dataset with id ".$self->sp_dataset_id()." does not exist"; }
293 my $dataset = JSON::Any->decode($row->dataset());
294 $self->data($dataset);
295 $self->name($row->name());
296 $self->description($row->description());
297 $self->sp_person_id($row->sp_person_id());
298 $self->accessions($dataset->{categories}->{accessions});
299 $self->plots($dataset->{categories}->{plots});
300 $self->plants($dataset->{categories}->{plants});
301 $self->trials($dataset->{categories}->{trials});
302 $self->traits($dataset->{categories}->{traits});
303 $self->years($dataset->{categories}->{years});
304 $self->locations($dataset->{categories}->{locations});
305 $self->breeding_programs($dataset->{categories}->{breeding_programs});
306 $self->genotyping_protocols($dataset->{categories}->{genotyping_protocols});
307 $self->trial_designs($dataset->{categories}->{trial_designs});
308 $self->trial_types($dataset->{categories}->{trial_types});
309 $self->category_order($dataset->{category_order});
310 $self->is_live($dataset->{is_live});
314 else { print STDERR "Creating empty dataset object\n"; }
320 =head1 CLASS METHODS
322 =head2 datasets_by_user()
325 =cut
327 sub get_datasets_by_user {
328 my $class = shift;
329 my $people_schema = shift;
330 my $sp_person_id = shift;
332 my $rs = $people_schema->resultset("SpDataset")->search( { sp_person_id => $sp_person_id });
334 my @datasets;
335 while (my $row = $rs->next()) {
336 push @datasets, [ $row->sp_dataset_id(), $row->name(), $row->description() ];
339 return \@datasets;
342 =head2 exists_dataset_name
344 Usage:
345 Desc:
346 Ret:
347 Args:
348 Side Effects:
349 Example:
351 =cut
353 sub exists_dataset_name {
354 my $class = shift;
355 my $people_schema = shift;
356 my $name = shift;
358 my $rs = $people_schema->resultset("SpDataset")->search( { name => { -ilike => $name}});
360 if ($rs->count() > 0) {
361 return 1;
363 else {
364 return 0;
369 =head1 METHODS
372 =head2 to_hashref()
375 =cut
377 sub to_hashref {
378 my $self = shift;
380 my $dataref = $self->get_dataset_data();
382 my $json = JSON::Any->encode($dataref);
384 my $data = {
385 name => $self->name(),
386 description => $self->description(),
387 sp_person_id => $self->sp_person_id(),
388 dataset => $json,
391 return $data;
396 =head2 store()
398 =cut
400 sub store {
401 my $self = shift;
405 print STDERR "dataset_id = ".$self->sp_dataset_id()."\n";
406 if (!$self->has_sp_dataset_id()) {
407 print STDERR "Creating new dataset row... ".$self->sp_dataset_id()."\n";
408 my $row = $self->people_schema()->resultset("SpDataset")->create($self->to_hashref());
409 $self->sp_dataset_id($row->sp_dataset_id());
410 return $row->sp_dataset_id();
412 else {
413 print STDERR "Updating dataset row ".$self->sp_dataset_id()."\n";
414 my $row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id => $self->sp_dataset_id() });
415 if ($row) {
416 $row->name($self->name());
417 $row->description($self->description());
418 $row->dataset(JSON::Any->encode($self->to_hashref()));
419 $row->sp_person_id($self->sp_person_id());
420 $row->update();
421 return $row->sp_dataset_id();
423 else {
424 print STDERR "Weird... has ".$self->sp_dataset_id()." but no data in db\n";
429 sub get_dataset_data {
430 my $self = shift;
431 my $dataref;
432 $dataref->{categories}->{accessions} = $self->accessions() if $self->has_accessions();
433 $dataref->{categories}->{plots} = $self->plots() if $self->has_plots();
434 $dataref->{categories}->{plants} = $self->plants() if $self->has_plants();
435 $dataref->{categories}->{trials} = $self->trials() if $self->has_trials();
436 $dataref->{categories}->{traits} = $self->traits() if $self->has_traits();
437 $dataref->{categories}->{years} = $self->years() if $self->has_years();
438 $dataref->{categories}->{breeding_programs} = $self->breeding_programs() if $self->has_breeding_programs();
439 $dataref->{categories}->{genotyping_protocols} = $self->genotyping_protocols() if $self->has_genotyping_protocols();
440 $dataref->{categories}->{trial_designs} = $self->trial_designs() if $self->has_trial_designs();
441 $dataref->{categories}->{trial_types} = $self->trial_types() if $self->has_trial_types();
442 $dataref->{categories}->{locations} = $self->locations() if $self->has_locations();
443 $dataref->{category_order} = $self->category_order();
444 return $dataref;
447 sub _get_dataref {
448 my $self = shift;
449 my $dataref;
451 $dataref->{accessions} = join(",", @{$self->accessions()}) if $self->has_accessions();
452 $dataref->{plots} = join(",", @{$self->plots()}) if $self->has_plots();
453 $dataref->{plants} = join(",", @{$self->plants()}) if $self->has_plants();
454 $dataref->{trials} = join(",", @{$self->trials()}) if $self->has_trials();
455 $dataref->{traits} = join(",", @{$self->traits()}) if $self->has_traits();
456 $dataref->{years} = join(",", @{$self->years()}) if $self->has_years();
457 $dataref->{breeding_programs} = join(",", @{$self->breeding_programs()}) if $self->has_breeding_programs();
458 $dataref->{genotyping_protocols} = join(",", @{$self->genotyping_protocols()}) if $self->has_genotyping_protocols();
459 $dataref->{trial_designs} = join(",", @{$self->trial_designs()}) if $self->has_trial_designs();
460 $dataref->{trial_types} = join(",", @{$self->trial_types()}) if $self->has_trial_types();
461 $dataref->{locations} = join(",", @{$self->locations()}) if $self->has_locations();
462 return $dataref;
465 sub _get_source_dataref {
466 my $self = shift;
467 my $source_type = shift;
469 my $dataref;
471 $dataref->{$source_type} = $self->_get_dataref();
473 return $dataref;
476 =head2 retrieve_genotypes()
478 Retrieves genotypes as a listref of hashrefs.
480 =cut
482 sub retrieve_genotypes {
483 my $self = shift;
484 my $protocol_id = shift;
485 my $genotypeprop_hash_select = shift || ['DS'];
486 my $protocolprop_top_key_select = shift || [];
487 my $protocolprop_marker_hash_select = shift || [];
488 my $return_only_first_genotypeprop_for_stock = shift || 1;
489 my $chromosome_list = shift || [];
490 my $start_position = shift;
491 my $end_position = shift;
492 my $marker_name_list = shift || [];
494 my $genotypes_search = CXGN::Genotype::Search->new(
495 bcs_schema => $self->schema(),
496 people_schema=>$self->people_schema,
497 accession_list => $self->accessions(),
498 trial_list => $self->trials(),
499 protocol_id_list => [$protocol_id],
500 chromosome_list => $chromosome_list,
501 start_position => $start_position,
502 end_position => $end_position,
503 marker_name_list => $marker_name_list,
504 genotypeprop_hash_select=>$genotypeprop_hash_select, #THESE ARE THE KEYS IN THE GENOTYPEPROP OBJECT
505 protocolprop_top_key_select=>$protocolprop_top_key_select, #THESE ARE THE KEYS AT THE TOP LEVEL OF THE PROTOCOLPROP OBJECT
506 protocolprop_marker_hash_select=>$protocolprop_marker_hash_select, #THESE ARE THE KEYS IN THE MARKERS OBJECT IN THE PROTOCOLPROP OBJECT
507 return_only_first_genotypeprop_for_stock=>$return_only_first_genotypeprop_for_stock #FOR MEMORY REASONS TO LIMIT DATA
509 my ($total_count, $dataref) = $genotypes_search->get_genotype_info();
510 return $dataref;
513 =head2 retrieve_phenotypes()
515 retrieves phenotypes as a listref of listrefs
517 =cut
519 sub retrieve_phenotypes {
520 my $self = shift;
521 my $phenotypes_search = CXGN::Phenotypes::PhenotypeMatrix->new(
522 search_type=>'MaterializedViewTable',
523 bcs_schema=>$self->schema(),
524 data_level=>$self->data_level(),
525 trait_list=>$self->traits(),
526 trial_list=>$self->trials(),
527 accession_list=>$self->accessions(),
528 exclude_phenotype_outlier=>$self->exclude_phenotype_outlier
530 my @data = $phenotypes_search->get_phenotype_matrix();
531 return \@data;
534 =head2 retrieve_accessions()
536 retrieves accessions as a listref of listref [stock_id, uniquname]
538 =cut
540 sub retrieve_accessions {
541 my $self = shift;
542 my $accessions;
543 if ($self->has_accessions()) {
544 return $self->accessions();
546 else {
547 my $criteria = $self->get_dataset_definition();
548 push @$criteria, "accessions";
550 $accessions = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("accessions"));
552 return $accessions->{results};
555 =head2 retrieve_plots()
557 Retrieves plots as a listref of listrefs.
559 =cut
561 sub retrieve_plots {
562 my $self = shift;
563 my $plots;
564 if ($self->has_plots()) {
565 return $self->plots();
567 else {
568 my $criteria = $self->get_dataset_definition();
569 push @$criteria, "plots";
570 $plots = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("plots"));
572 return $plots->{results};
575 =head2 retrieve_plants()
577 Retrieves plants as a listref of listrefs.
579 =cut
581 sub retrieve_plants {
582 my $self = shift;
583 my $plants;
584 if ($self->has_plants()) {
585 return $self->plants();
587 else {
588 my $criteria = $self->get_dataset_definition();
589 push @$criteria, "plants";
590 $plants = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("plants"));
592 return $plants->{results};
595 =head2 retrieve_trials()
597 retrieves trials as a listref of listrefs.
599 =cut
601 sub retrieve_trials {
602 my $self = shift;
603 my $trials;
604 if ($self->has_trials()) {
605 return $self->trials();
607 else {
608 my $criteria = $self->get_dataset_definition();
609 push @$criteria, "trials";
610 $trials = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trials"));
612 print STDERR "TRIALS: ".Dumper($trials);
613 return $trials->{results};
616 =head2 retrieve_traits()
618 retrieves traits as a listref of listrefs.
620 =cut
622 sub retrieve_traits {
623 my $self = shift;
624 my $traits;
625 if ($self->has_traits()) {
626 return $self->traits();
628 else {
629 my $criteria = $self->get_dataset_definition();
630 push @$criteria, "traits";
631 $traits = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("traits"));
633 return $traits->{results};
637 =head2 retrieve_years()
639 retrieves years as a listref of listrefs
641 =cut
643 sub retrieve_years {
644 my $self = shift;
645 my @years;
646 if ($self->has_years()) {
647 return $self->years();
649 else {
650 my $criteria = $self->get_dataset_definition();
651 push @$criteria, "years";
652 my $year_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("years"));
653 my $year_list = $year_data->{result};
655 foreach my $y (@$year_list) {
656 push @years, $y->[0];
659 return \@years;
662 =head2 retrieve_years()
664 retrieves years as a listref of listrefs
666 =cut
668 sub retrieve_locations {
669 my $self = shift;
670 my @locations;
671 if ($self->has_locations()) {
672 return $self->locations();
674 else {
675 my $criteria = $self->get_dataset_definition();
676 push @$criteria, "locations";
677 my $location_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("locations"));
678 my $location_list = $location_data->{result};
680 foreach my $y (@$location_list) {
681 push @locations, $y->[0];
684 return \@locations;
687 =head2 retrieve_breeding_programs
689 Usage:
690 Desc:
691 Ret:
692 Args:
693 Side Effects:
694 Example:
696 =cut
698 sub retrieve_breeding_programs {
699 my $self = shift;
700 my @breeding_programs;
701 if ($self->has_breeding_programs()) {
702 return $self->breeding_programs();
704 else {
705 my $criteria = $self->get_dataset_definition();
706 push @$criteria, "breeding_programs";
707 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("breeding_programs"));
708 my $breeding_program_list = $breeding_program_data->{result};
710 foreach my $y (@$breeding_program_list) {
711 push @breeding_programs, $y->[0];
714 return \@breeding_programs;
717 =head2 retrieve_genotyping_protocols
719 Usage:
720 Desc:
721 Ret:
722 Args:
723 Side Effects:
724 Example:
726 =cut
728 sub retrieve_genotyping_protocols {
729 my $self = shift;
730 my @genotyping_protocols;
731 if ($self->has_genotyping_protocols()) {
732 return $self->genotyping_protocols();
734 else {
735 my $criteria = $self->get_dataset_definition();
736 push @$criteria, "genotyping_protocols";
737 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("genotyping_protocols"));
738 my $breeding_program_list = $breeding_program_data->{result};
740 foreach my $y (@$breeding_program_list) {
741 push @genotyping_protocols, $y->[0];
744 return \@genotyping_protocols;
747 =head2 retrieve_trial_designs
749 Usage:
750 Desc:
751 Ret:
752 Args:
753 Side Effects:
754 Example:
756 =cut
758 sub retrieve_trial_designs {
759 my $self = shift;
760 my @trial_designs;
761 if ($self->has_trial_designs()) {
762 return $self->trial_designs();
764 else {
765 my $criteria = $self->get_dataset_definition();
766 push @$criteria, "trial_designs";
767 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trial_designs"));
768 my $breeding_program_list = $breeding_program_data->{result};
770 foreach my $y (@$breeding_program_list) {
771 push @trial_designs, $y->[0];
774 return \@trial_designs;
778 =head2 retrieve_trial_types
780 Usage:
781 Desc:
782 Ret:
783 Args:
784 Side Effects:
785 Example:
787 =cut
789 sub retrieve_trial_types {
790 my $self = shift;
791 my @trial_types;
792 if ($self->has_trial_types()) {
793 return $self->trial_types();
795 else {
796 my $criteria = $self->get_dataset_definition();
797 push @$criteria, "trial_types";
798 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trial_types"));
799 my $breeding_program_list = $breeding_program_data->{result};
801 foreach my $y (@$breeding_program_list) {
802 push @trial_types, $y->[0];
805 return \@trial_types;
809 sub get_dataset_definition {
810 my $self = shift;
811 my @criteria;
813 if ($self->has_accessions()) {
814 push @criteria, "accessions";
816 if ($self->has_plots()) {
817 push @criteria, "plots";
819 if ($self->has_plants()) {
820 push @criteria, "plants";
822 if ($self->has_trials()) {
823 push @criteria, "trials";
825 if ($self->has_traits()) {
826 push @criteria, "traits";
828 if ($self->has_years()) {
829 push @criteria, "years";
831 if ($self->has_locations()) {
832 push @criteria, "locations";
834 if ($self->has_breeding_programs()) {
835 push @criteria, "breeding_programs";
837 if ($self->has_genotyping_protocols()) {
838 push @criteria, "genotyping_protocols";
840 if ($self->has_trial_types()) {
841 push @criteria, "trial_types";
843 if ($self->has_trial_designs()) {
844 push @criteria, "trial_designs";
848 return \@criteria;
852 =head2 delete()
854 Usage: $dataset->delete();
855 Desc: Deletes the specified dataset. Returns a string with an
856 error message is unsuccessful.
857 Ret: string if failure, undef if success
858 Args:
859 Side Effects: The function does not check for ownership of the dataset,
860 this has to be implemented in the calling function.
861 Example:
863 =cut
865 sub delete {
866 my $self = shift;
868 my $row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id => $self->sp_dataset_id() });
870 if (! $row) {
871 return "The specified dataset does not exist";
874 else {
875 eval {
876 $row->delete();
878 if ($@) {
879 return "An error occurred, $@";
882 else {
883 return undef;