ignore emacs backup files also in db/run_all_patches.pl
[sgn.git] / lib / CXGN / Dataset.pm
blobb59a73dd465d688f77d6f1e7a14ce4a9203c748a
2 =head1 NAME
4 CXGN::Dataset - a class to easily query the database for breeding data
6 =head1 DESCRIPTION
8 CXGN::Dataset can be used to flexibly define datasets for breeding applications. For example, a dataset can be defined using a list of germplasm, a list of trials, a list of years, etc., or a combination of the above. Once defined, it makes it easy to obtain related phenotypes, genotypes, and other data.
10 Datasets can be stored in the database and retrieved for later use.
12 Currently, there are three incarnations of CXGN::Dataset:
14 =over 5
16 =item CXGN::Dataset
18 Unbuffered output of the queries
20 =item CXGN::Dataset::File
22 Writes results to files
24 =item CXGN::Dataset::Cache
26 Returns output like CXGN::Dataset, but uses a disk-cache for the response data
28 =back
30 =head1 SYNOPSIS
32 my $ds = CXGN::Dataset->new( people_schema => $p, schema => $s);
33 $ds->accessions([ 'a', 'b', 'c' ]);
34 my $trials = $ds->retrieve_trials();
35 my $sp_dataset_id = $ds->store();
36 #...
37 my $restored_ds = CXGN::Dataset->new( people_schema => $p, schema => $s, sp_dataset_id => $sp_dataset_id );
38 my $years = $restored_ds->retrieve_years();
39 #...
41 =head1 AUTHOR
43 Lukas Mueller <lam87@cornell.edu>
46 =head1 ACCESSORS
48 =cut
51 package CXGN::Dataset;
53 use Moose;
54 use Moose::Util::TypeConstraints;
55 use Data::Dumper;
56 use JSON::Any;
57 use CXGN::BreederSearch;
58 use CXGN::People::Schema;
59 use CXGN::Phenotypes::PhenotypeMatrix;
60 use CXGN::Genotype::Search;
62 =head2 people_schema()
64 accessor for CXGN::People::Schema database object
66 =cut
# Required CXGN::People::Schema handle; BUILD, store and delete use it to reach
# the SpDataset resultset.
has people_schema => (
    is       => 'rw',
    isa      => 'CXGN::People::Schema',
    required => 1,
);
70 =head2 schema()
72 accessor for Bio::Chado::Schema database object
74 =cut
# Required Bio::Chado::Schema handle; its storage dbh feeds the breeder search
# and it is passed to the phenotype and genotype searches.
has schema => (
    is       => 'rw',
    isa      => 'Bio::Chado::Schema',
    required => 1,
);
78 =head2 sp_dataset_id()
80 accessor for sp_dataset primary key
82 =cut
# Primary key of the stored dataset row. has_sp_dataset_id() tells BUILD and
# store() whether this object is backed by a database row.
has sp_dataset_id => (
    is        => 'rw',
    isa       => 'Int',
    predicate => 'has_sp_dataset_id',
);
90 =head2 data()
92 accessor for the json-formatted data structure (as used for the backend storage)
94 =cut
# Decoded dataset structure as loaded from the backend (set by BUILD).
has data => (
    is  => 'rw',
    isa => 'HashRef',
);
100 =head2 name()
102 accessor for the name of this dataset
104 =cut
# Name of the dataset; may be undef.
has name => (
    is  => 'rw',
    isa => 'Maybe[Str]',
);
110 =head2 description()
112 accessor for the description of this dataset
114 =cut
# Free-text description of the dataset; may be undef.
has description => (
    is  => 'rw',
    isa => 'Maybe[Str]',
);
120 =head2 sp_person_id()
122 accessor for sp_person_id (owner of the dataset)
124 =cut
# sp_person_id of the dataset owner; may be undef.
has sp_person_id => (
    is  => 'rw',
    isa => 'Maybe[Int]',
);
131 =head2 accessions()
133 accessor for defining the accessions that are part of this dataset (ArrayRef).
135 =cut
# Accessions that define this dataset (array ref, or undef).
has accessions => (
    is        => 'rw',
    isa       => 'Maybe[ArrayRef]',
    predicate => 'has_accessions',
);
142 =head2 plots()
144 accessor for defining the plots that are part of this dataset (ArrayRef).
146 =cut
# Plots that define this dataset (array ref, or undef).
has plots => (
    is        => 'rw',
    isa       => 'Maybe[ArrayRef]',
    predicate => 'has_plots',
);
154 =head2 trials()
156 accessor for defining the trials that are part of this dataset (ArrayRef).
158 =cut
# Trials that define this dataset (array ref, or undef).
has trials => (
    is        => 'rw',
    isa       => 'Maybe[ArrayRef]',
    predicate => 'has_trials',
);
167 =head2 traits()
169 =cut
# Traits that define this dataset (array ref, or undef).
has traits => (
    is        => 'rw',
    isa       => 'Maybe[ArrayRef]',
    predicate => 'has_traits',
);
176 =head2 years()
178 =cut
# Years that define this dataset (array ref, or undef).
has years => (
    is        => 'rw',
    isa       => 'Maybe[ArrayRef]',
    predicate => 'has_years',
);
186 =head2 breeding_programs()
188 =cut
# Breeding programs that define this dataset (array ref, or undef).
# Unlike the other category attributes this one declares a default of an empty
# array ref.
# NOTE(review): with a non-lazy default, has_breeding_programs() is presumably
# true on every constructed object -- confirm this is intended.
has breeding_programs => (
    is        => 'rw',
    isa       => 'Maybe[ArrayRef]',
    default   => sub { [] },
    predicate => 'has_breeding_programs',
);
196 =head2 genotyping_protocols()
198 =cut
# Genotyping protocols that define this dataset (array ref, or undef).
has genotyping_protocols => (
    is        => 'rw',
    isa       => 'Maybe[ArrayRef]',
    predicate => 'has_genotyping_protocols',
);
205 =head2 trial_types()
207 =cut
# Trial types that define this dataset (array ref, or undef).
has trial_types => (
    is        => 'rw',
    isa       => 'Maybe[ArrayRef]',
    predicate => 'has_trial_types',
);
214 =head2 trial_designs()
216 =cut
# Trial designs that define this dataset (array ref, or undef).
has trial_designs => (
    is        => 'rw',
    isa       => 'Maybe[ArrayRef]',
    predicate => 'has_trial_designs',
);
223 =head2 locations()
225 =cut
# Locations that define this dataset (array ref, or undef).
has locations => (
    is        => 'rw',
    isa       => 'Maybe[ArrayRef]',
    predicate => 'has_locations',
);

# Order of the categories as stored with the dataset (array ref, or undef);
# written by get_dataset_data() and restored by BUILD.
has category_order => (
    is        => 'rw',
    isa       => 'Maybe[ArrayRef]',
    predicate => 'has_category_order',
);

# Boolean flag restored from the stored dataset by BUILD; defaults to false.
has is_live => (
    is      => 'rw',
    isa     => 'Bool',
    default => 0,
);
248 =head2 data_level()
250 =cut
# Observation level handed to the phenotype search: one of plot, plant or
# subplot (default 'plot').
# The declaration used to list `isa` twice; the first value ('String', which
# is not a Moose type name) was silently overridden by the enum below, so only
# the effective constraint is kept.
has data_level => (
    is      => 'rw',
    isa     => enum([qw[ plot plant subplot ]]),
    default => 'plot',
);
258 =head2 exclude_phenotype_outlier()
260 =cut
# Read-only flag forwarded to the phenotype search by retrieve_phenotypes();
# defaults to false.
has exclude_phenotype_outlier => (
    is      => 'ro',
    isa     => 'Bool',
    default => 0,
);

# CXGN::BreederSearch instance used by the retrieve_* methods; BUILD creates it.
has breeder_search => (
    is  => 'rw',
    isa => 'CXGN::BreederSearch',
);
# Moose BUILD hook.
#
# Always attaches a CXGN::BreederSearch built from the chado schema's dbh.
# When sp_dataset_id was supplied, loads the matching SpDataset row and copies
# its name, description, owner and decoded dataset definition onto the object.
#
# Dies if sp_dataset_id is set but no such row exists.
sub BUILD {
    my $self = shift;

    # Guard the log line: sp_dataset_id is undef for a brand-new dataset.
    my $dataset_id = $self->has_sp_dataset_id() ? $self->sp_dataset_id() : undef;
    print STDERR "Processing dataset_id ".(defined $dataset_id ? $dataset_id : '')."\n";

    my $bs = CXGN::BreederSearch->new(dbh => $self->schema->storage->dbh());
    $self->breeder_search($bs);

    if (!$self->has_sp_dataset_id()) {
        print STDERR "Creating empty dataset object\n";
        return;
    }

    my $row = $self->people_schema()->resultset("SpDataset")->find({ sp_dataset_id => $dataset_id });
    if (!$row) { die "The dataset with id ".$dataset_id." does not exist"; }

    my $dataset = JSON::Any->decode($row->dataset());
    $self->data($dataset);
    $self->name($row->name());
    $self->description($row->description());
    $self->sp_person_id($row->sp_person_id());

    # Only populate categories that are actually present in the stored
    # definition. Calling a setter with undef would flip the has_* predicate
    # to true, which makes the retrieve_* methods return undef instead of
    # querying, and makes _get_dataref dereference undef.
    my $categories = $dataset->{categories} || {};
    foreach my $category (qw(accessions plots trials traits years locations
                             breeding_programs genotyping_protocols
                             trial_designs trial_types)) {
        next unless defined $categories->{$category};
        $self->$category($categories->{$category});
    }

    # Keep the attribute defaults when the stored definition has no value.
    $self->category_order($dataset->{category_order}) if defined $dataset->{category_order};
    $self->is_live($dataset->{is_live}) if defined $dataset->{is_live};

    return;
}
307 =head1 CLASS METHODS
309 =head2 get_datasets_by_user()
312 =cut
# Class method.
# Args: a people schema object and an sp_person_id.
# Returns an array ref holding one [ sp_dataset_id, name, description ]
# triple per SpDataset row whose sp_person_id matches.
sub get_datasets_by_user {
    my ($class, $people_schema, $sp_person_id) = @_;

    my $owned = $people_schema->resultset("SpDataset")->search( { sp_person_id => $sp_person_id });

    my @summaries = ();
    while (my $dataset_row = $owned->next()) {
        my @summary = ( $dataset_row->sp_dataset_id(), $dataset_row->name(), $dataset_row->description() );
        push @summaries, \@summary;
    }

    return \@summaries;
}
329 =head2 exists_dataset_name
331 Usage:
332 Desc:
333 Ret:
334 Args:
335 Side Effects:
336 Example:
338 =cut
# Class method.
# Args: a people schema object and a dataset name.
# Returns 1 if an SpDataset row with that name exists (compared
# case-insensitively through ILIKE), 0 otherwise.
sub exists_dataset_name {
    my $class = shift;
    my $people_schema = shift;
    my $name = shift;

    # ILIKE treats % and _ as wildcards and backslash as the escape character,
    # so escape them; otherwise a name such as "my_set" would also match
    # "myXset".
    my $pattern = $name;
    $pattern =~ s/([\\%_])/\\$1/g if defined $pattern;

    my $rs = $people_schema->resultset("SpDataset")->search( { name => { -ilike => $pattern } });

    return $rs->count() > 0 ? 1 : 0;
}
356 =head1 METHODS
358 =head2 store()
360 =cut
# Persists the dataset definition in the SpDataset table.
#
# Without an sp_dataset_id a new row is created and its id is stored on the
# object; otherwise the existing row is updated in place.
#
# Returns the sp_dataset_id of the created or updated row, or undef when the
# object carries an sp_dataset_id for which no row exists.
sub store {
    my $self = shift;

    my $dataref = $self->get_dataset_data();
    my $json = JSON::Any->encode($dataref);

    my $data = {
        name         => $self->name(),
        description  => $self->description(),
        sp_person_id => $self->sp_person_id(),
        dataset      => $json,
    };

    # The id is undef for a new dataset; log an empty string instead of
    # concatenating undef.
    my $has_id = $self->has_sp_dataset_id();
    my $id_label = $has_id && defined $self->sp_dataset_id() ? $self->sp_dataset_id() : '';
    print STDERR "dataset_id = ".$id_label."\n";

    if (!$has_id) {
        print STDERR "Creating new dataset row... ".$id_label."\n";
        my $row = $self->people_schema()->resultset("SpDataset")->create($data);
        $self->sp_dataset_id($row->sp_dataset_id());
        return $row->sp_dataset_id();
    }

    print STDERR "Updating dataset row ".$id_label."\n";
    my $row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id => $self->sp_dataset_id() });
    if ($row) {
        $row->name($self->name());
        $row->description($self->description());
        $row->dataset($json);
        $row->sp_person_id($self->sp_person_id());
        $row->update();
        return $row->sp_dataset_id();
    }

    print STDERR "Weird... has ".$id_label." but no data in db\n";
    # Nothing was stored: return undef rather than letting the result of the
    # print above (1) leak out as if it were a dataset id.
    return;
}
# Builds the hash ref that store() serialises.
# Every category whose has_* predicate is true is copied under
# {categories}{<category>}; {category_order} is always written, even if undef.
# NOTE(review): BUILD reads an is_live key from the stored structure, but this
# method never writes one -- confirm whether is_live is meant to be persisted.
sub get_dataset_data {
    my $self = shift;
    my $dataref;

    my @category_names = qw(
        accessions plots trials traits years
        breeding_programs genotyping_protocols
        trial_designs trial_types locations
    );
    foreach my $category (@category_names) {
        my $predicate = "has_$category";
        next if !$self->$predicate();
        $dataref->{categories}->{$category} = $self->$category();
    }

    $dataref->{category_order} = $self->category_order();
    return $dataref;
}
# Returns a hash ref mapping each populated category to a comma-joined string
# of its members, or undef when no category is populated.
#
# A category counts as populated only when its has_* predicate is true AND its
# value is defined. The attributes are Maybe[ArrayRef], so a predicate can be
# true while the value is undef; dereferencing that would die.
sub _get_dataref {
    my $self = shift;
    my $dataref;

    my @category_names = qw(
        accessions plots trials traits years
        breeding_programs genotyping_protocols
        trial_designs trial_types locations
    );
    foreach my $category (@category_names) {
        my $predicate = "has_$category";
        next unless $self->$predicate();

        my $members = $self->$category();
        next unless defined $members;

        $dataref->{$category} = join(",", @{$members});
    }

    return $dataref;
}
# Wraps the result of _get_dataref() in a one-key hash ref keyed by
# $source_type; the retrieve_* methods pass this as the second argument to
# metadata_query().
sub _get_source_dataref {
    my ($self, $source_type) = @_;

    my %by_source;
    $by_source{$source_type} = $self->_get_dataref();

    return \%by_source;
}
446 =head2 retrieve_genotypes()
448 Retrieves genotypes as a listref of hashrefs.
450 =cut
# Runs a CXGN::Genotype::Search restricted to this dataset's accessions and
# trials and to the given genotyping protocol id.
# Returns the second element of the list produced by get_genotype_info(); the
# first element (assigned to a total count in the original code) is discarded.
sub retrieve_genotypes {
    my ($self, $protocol_id) = @_;

    my $search = CXGN::Genotype::Search->new(
        bcs_schema => $self->schema(),
        accession_list => $self->accessions(),
        trial_list => $self->trials(),
        protocol_id_list => [$protocol_id],
    );

    my (undef, $genotypes) = $search->get_genotype_info();
    return $genotypes;
}
466 =head2 retrieve_phenotypes()
468 retrieves phenotypes as a listref of listrefs
470 =cut
# Runs a CXGN::Phenotypes::PhenotypeMatrix search (MaterializedViewTable search
# type) over this dataset's traits, trials and accessions at the configured
# data level.
# Returns an array ref holding the list returned by get_phenotype_matrix().
sub retrieve_phenotypes {
    my $self = shift;

    my $matrix_search = CXGN::Phenotypes::PhenotypeMatrix->new(
        search_type=>'MaterializedViewTable',
        bcs_schema=>$self->schema(),
        data_level=>$self->data_level(),
        trait_list=>$self->traits(),
        trial_list=>$self->trials(),
        accession_list=>$self->accessions(),
        exclude_phenotype_outlier=>$self->exclude_phenotype_outlier,
    );

    return [ $matrix_search->get_phenotype_matrix() ];
}
487 =head2 retrieve_accessions()
489 retrieves accessions as a listref of listrefs [stock_id, uniquename]
491 =cut
# Returns the accessions attribute when has_accessions() is true. Otherwise
# asks the breeder search for accessions matching the dataset's other criteria
# and returns the {results} entry of its response.
sub retrieve_accessions {
    my $self = shift;

    return $self->accessions() if $self->has_accessions();

    my $criteria = $self->get_dataset_definition();
    push @{$criteria}, "accessions";

    my $response = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("accessions"));
    return $response->{results};
}
508 =head2 retrieve_plots()
510 Retrieves plots as a listref of listrefs.
512 =cut
# Returns the plots attribute when has_plots() is true. Otherwise asks the
# breeder search for plots matching the dataset's other criteria and returns
# the {results} entry of its response.
sub retrieve_plots {
    my $self = shift;

    return $self->plots() if $self->has_plots();

    my $criteria = $self->get_dataset_definition();
    push @{$criteria}, "plots";

    my $response = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("plots"));
    return $response->{results};
}
528 =head2 retrieve_trials()
530 retrieves trials as a listref of listrefs.
532 =cut
# Returns the trials attribute when has_trials() is true. Otherwise asks the
# breeder search for trials matching the dataset's other criteria and returns
# the {results} entry of its response.
#
# The leftover debugging dump of the whole response to STDERR was removed; the
# sibling retrieve_* methods never emitted one.
sub retrieve_trials {
    my $self = shift;

    return $self->trials() if $self->has_trials();

    my $criteria = $self->get_dataset_definition();
    push @$criteria, "trials";

    my $trials = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trials"));
    return $trials->{results};
}
549 =head2 retrieve_traits()
551 retrieves traits as a listref of listrefs.
553 =cut
# Returns the traits attribute when has_traits() is true. Otherwise asks the
# breeder search for traits matching the dataset's other criteria and returns
# the {results} entry of its response.
sub retrieve_traits {
    my $self = shift;

    return $self->traits() if $self->has_traits();

    my $criteria = $self->get_dataset_definition();
    push @{$criteria}, "traits";

    my $response = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("traits"));
    return $response->{results};
}
570 =head2 retrieve_years()
572 retrieves years as a listref of listrefs
574 =cut
# Returns the years attribute when has_years() is true. Otherwise queries the
# breeder search for years matching the dataset's other criteria and returns
# an array ref holding the first element of every result row.
sub retrieve_years {
    my $self = shift;

    return $self->years() if $self->has_years();

    my $criteria = $self->get_dataset_definition();
    push @$criteria, "years";
    my $year_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("years"));

    # The sibling methods (accessions, plots, trials, traits) read the rows
    # from {results}; this method used to read {result}. Prefer {results} and
    # fall back to the old key so either response shape works.
    my $year_list = $year_data->{results};
    $year_list = $year_data->{result} if !defined $year_list;

    my @years = map { $_->[0] } @{ $year_list || [] };
    return \@years;
}
595 =head2 retrieve_locations()
597 retrieves locations as a listref
599 =cut
# Returns the locations attribute when has_locations() is true. Otherwise
# queries the breeder search for locations matching the dataset's other
# criteria and returns an array ref holding the first element of every result
# row.
sub retrieve_locations {
    my $self = shift;

    return $self->locations() if $self->has_locations();

    my $criteria = $self->get_dataset_definition();
    push @$criteria, "locations";
    my $location_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("locations"));

    # The sibling methods (accessions, plots, trials, traits) read the rows
    # from {results}; this method used to read {result}. Prefer {results} and
    # fall back to the old key so either response shape works.
    my $location_list = $location_data->{results};
    $location_list = $location_data->{result} if !defined $location_list;

    my @locations = map { $_->[0] } @{ $location_list || [] };
    return \@locations;
}
620 =head2 retrieve_breeding_programs
622 Usage:
623 Desc:
624 Ret:
625 Args:
626 Side Effects:
627 Example:
629 =cut
# Returns the breeding_programs attribute when has_breeding_programs() is
# true. Otherwise queries the breeder search for breeding programs matching
# the dataset's other criteria and returns an array ref holding the first
# element of every result row.
# NOTE(review): the breeding_programs attribute declares a default, so the
# predicate is presumably always true and the query branch may never run --
# confirm.
sub retrieve_breeding_programs {
    my $self = shift;

    return $self->breeding_programs() if $self->has_breeding_programs();

    my $criteria = $self->get_dataset_definition();
    push @$criteria, "breeding_programs";
    my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("breeding_programs"));

    # The sibling methods (accessions, plots, trials, traits) read the rows
    # from {results}; this method used to read {result}. Prefer {results} and
    # fall back to the old key so either response shape works.
    my $breeding_program_list = $breeding_program_data->{results};
    $breeding_program_list = $breeding_program_data->{result} if !defined $breeding_program_list;

    my @breeding_programs = map { $_->[0] } @{ $breeding_program_list || [] };
    return \@breeding_programs;
}
650 =head2 retrieve_genotyping_protocols
652 Usage:
653 Desc:
654 Ret:
655 Args:
656 Side Effects:
657 Example:
659 =cut
# Returns the genotyping_protocols attribute when has_genotyping_protocols()
# is true. Otherwise queries the breeder search for genotyping protocols
# matching the dataset's other criteria and returns an array ref holding the
# first element of every result row.
sub retrieve_genotyping_protocols {
    my $self = shift;

    return $self->genotyping_protocols() if $self->has_genotyping_protocols();

    my $criteria = $self->get_dataset_definition();
    push @$criteria, "genotyping_protocols";
    my $protocol_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("genotyping_protocols"));

    # The sibling methods (accessions, plots, trials, traits) read the rows
    # from {results}; this method used to read {result}. Prefer {results} and
    # fall back to the old key so either response shape works.
    my $protocol_list = $protocol_data->{results};
    $protocol_list = $protocol_data->{result} if !defined $protocol_list;

    my @genotyping_protocols = map { $_->[0] } @{ $protocol_list || [] };
    return \@genotyping_protocols;
}
680 =head2 retrieve_trial_designs
682 Usage:
683 Desc:
684 Ret:
685 Args:
686 Side Effects:
687 Example:
689 =cut
# Returns the trial_designs attribute when has_trial_designs() is true.
# Otherwise queries the breeder search for trial designs matching the
# dataset's other criteria and returns an array ref holding the first element
# of every result row.
sub retrieve_trial_designs {
    my $self = shift;

    return $self->trial_designs() if $self->has_trial_designs();

    my $criteria = $self->get_dataset_definition();
    push @$criteria, "trial_designs";
    my $design_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trial_designs"));

    # The sibling methods (accessions, plots, trials, traits) read the rows
    # from {results}; this method used to read {result}. Prefer {results} and
    # fall back to the old key so either response shape works.
    my $design_list = $design_data->{results};
    $design_list = $design_data->{result} if !defined $design_list;

    my @trial_designs = map { $_->[0] } @{ $design_list || [] };
    return \@trial_designs;
}
711 =head2 retrieve_trial_types
713 Usage:
714 Desc:
715 Ret:
716 Args:
717 Side Effects:
718 Example:
720 =cut
# Returns the trial_types attribute when has_trial_types() is true. Otherwise
# queries the breeder search for trial types matching the dataset's other
# criteria and returns an array ref holding the first element of every result
# row.
sub retrieve_trial_types {
    my $self = shift;

    return $self->trial_types() if $self->has_trial_types();

    my $criteria = $self->get_dataset_definition();
    push @$criteria, "trial_types";
    my $type_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trial_types"));

    # The sibling methods (accessions, plots, trials, traits) read the rows
    # from {results}; this method used to read {result}. Prefer {results} and
    # fall back to the old key so either response shape works.
    my $type_list = $type_data->{results};
    $type_list = $type_data->{result} if !defined $type_list;

    my @trial_types = map { $_->[0] } @{ $type_list || [] };
    return \@trial_types;
}
# Returns an array ref naming every category whose has_* predicate is true,
# always in this fixed order: accessions, plots, trials, traits, years,
# locations, breeding_programs, genotyping_protocols, trial_types,
# trial_designs.
sub get_dataset_definition {
    my $self = shift;

    my @ordered_categories = qw(
        accessions plots trials traits years locations
        breeding_programs genotyping_protocols
        trial_types trial_designs
    );

    my @criteria;
    foreach my $category (@ordered_categories) {
        my $predicate = "has_$category";
        push @criteria, $category if $self->$predicate();
    }

    return \@criteria;
}
782 =head2 delete()
784 Usage: $dataset->delete();
785 Desc: Deletes the specified dataset. Returns a string with an
786 error message if unsuccessful.
787 Ret: string if failure, undef if success
788 Args:
789 Side Effects: The function does not check for ownership of the dataset,
790 this has to be implemented in the calling function.
791 Example:
793 =cut
# Deletes the SpDataset row identified by this object's sp_dataset_id.
# Returns undef on success, or an error string when the row does not exist or
# the deletion throws. Ownership is not checked here.
sub delete {
    my $self = shift;

    my $dataset_row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id => $self->sp_dataset_id() });

    return "The specified dataset does not exist" if !$dataset_row;

    eval {
        $dataset_row->delete();
    };
    return "An error occurred, $@" if $@;

    return undef;
}