4 CXGN::Dataset - a class to easily query the database for breeding data
8 CXGN::Dataset can be used to flexibly define datasets for breeding applications. For example, a dataset can be defined using a list of germplasm, a list of trials, a list of years, etc, or a combination of the above. Once defined, it allows to easily obtain related phenotypes and genotypes and other data.
10 Datasets can be stored in the database and retrieved for later use.
12 Currently, there are three incarnations of CXGN::Dataset:
18 Unbuffered output of the queries
20 =item CXGN::Dataset::File
22 Writes results to files
24 =item CXGN::Dataset::Cache
26 Returns output like CXGN::Dataset, but uses a disk-cache for the response data
32 my $ds = CXGN::Dataset->new( people_schema => $p, schema => $s);
33 $ds->accessions([ 'a', 'b', 'c' ]);
34 my $trials = $ds->retrieve_trials();
35 my $sp_dataset_id = $ds->store();
37 my $restored_ds = CXGN::Dataset( people_schema => $p, schema => $s, sp_dataset_id => $sp_dataset_id );
38 my $years = $restored_ds->retrieve_years();
43 Lukas Mueller <lam87@cornell.edu>
51 package CXGN
::Dataset
;
54 use Moose
::Util
::TypeConstraints
;
57 use CXGN
::BreederSearch
;
58 use CXGN
::People
::Schema
;
59 use CXGN
::Phenotypes
::PhenotypeMatrix
;
60 use CXGN
::Genotype
::Search
;
62 =head2 people_schema()
64 accessor for CXGN::People::Schema database object
68 has
'people_schema' => (isa
=> 'CXGN::People::Schema', is
=> 'rw', required
=> 1 );
72 accessor for Bio::Chado::Schema database object
76 has
'schema' => ( isa
=> "Bio::Chado::Schema", is
=> 'rw', required
=> 1 );
78 =head2 sp_dataset_id()
80 accessor for sp_dataset primary key
85 has
'sp_dataset_id' => ( isa
=> 'Int',
87 predicate
=> 'has_sp_dataset_id',
92 accessor for the json-formatted data structure (as used for the backend storage)
96 has
'data' => ( isa
=> 'HashRef',
102 accessor for the name of this dataset
106 has
'name' => ( isa
=> 'Maybe[Str]',
112 accessor for the descrition of this dataset
116 has
'description' => ( isa
=> 'Maybe[Str]',
120 =head2 sp_person_id()
122 accessor for sp_person_id (owner of the dataset)
126 has
'sp_person_id' => ( isa
=> 'Maybe[Int]',
133 accessor for defining the accessions that are part of this dataset (ArrayRef).
137 has
'accessions' => ( isa
=> 'Maybe[ArrayRef]',
139 predicate
=> 'has_accessions',
144 accessor for defining the plots that are part of this dataset (ArrayRef).
148 has
'plots' => ( isa
=> 'Maybe[ArrayRef]',
150 predicate
=> 'has_plots',
155 accessor for defining the plants that are part of this dataset (ArrayRef).
159 has
'plants' => ( isa
=> 'Maybe[ArrayRef]',
161 predicate
=> 'has_plants',
168 accessor for defining the trials that are part of this dataset (ArrayRef).
173 has
'trials' => ( isa
=> 'Maybe[ArrayRef]',
175 predicate
=> 'has_trials',
183 has
'traits' => ( isa
=> 'Maybe[ArrayRef]',
185 predicate
=> 'has_traits',
193 has
'years' => ( isa
=> 'Maybe[ArrayRef]',
195 predicate
=> 'has_years',
198 =head2 breeding_programs()
202 has
'breeding_programs' => ( isa
=> 'Maybe[ArrayRef]',
204 predicate
=> 'has_breeding_programs',
205 default => sub { [] },
208 =head2 genotyping_protocols()
212 has
'genotyping_protocols' => ( isa
=> 'Maybe[ArrayRef]',
214 predicate
=> 'has_genotyping_protocols',
221 has
'trial_types' => ( isa
=> 'Maybe[ArrayRef]',
223 predicate
=> 'has_trial_types',
226 =head2 trial_designs()
230 has
'trial_designs' => ( isa
=> 'Maybe[ArrayRef]',
232 predicate
=> 'has_trial_designs',
239 has
'locations' => ( isa
=> 'Maybe[ArrayRef]',
241 predicate
=> 'has_locations',
245 has
'category_order' => ( isa
=> 'Maybe[ArrayRef]',
247 predicate
=> 'has_category_order',
254 has
'is_live' => ( isa
=> 'Bool',
264 has
'data_level' => ( isa
=> 'String',
266 isa
=> enum
([qw
[ plot plant subplot
]]),
270 =head2 exclude_phenotype_outlier()
274 has
'exclude_phenotype_outlier' => (
280 has
'breeder_search' => (isa
=> 'CXGN::BreederSearch', is
=> 'rw');
286 my $bs = CXGN
::BreederSearch
->new(dbh
=> $self->schema->storage->dbh());
287 $self->breeder_search($bs);
289 if ($self->has_sp_dataset_id()) {
290 print STDERR
"Processing dataset_id ".$self->sp_dataset_id()."\n";
291 my $row = $self->people_schema()->resultset("SpDataset")->find({ sp_dataset_id
=> $self->sp_dataset_id() });
292 if (!$row) { die "The dataset with id ".$self->sp_dataset_id()." does not exist"; }
293 my $dataset = JSON
::Any
->decode($row->dataset());
294 $self->data($dataset);
295 $self->name($row->name());
296 $self->description($row->description());
297 $self->sp_person_id($row->sp_person_id());
298 $self->accessions($dataset->{categories
}->{accessions
});
299 $self->plots($dataset->{categories
}->{plots
});
300 $self->plants($dataset->{categories
}->{plants
});
301 $self->trials($dataset->{categories
}->{trials
});
302 $self->traits($dataset->{categories
}->{traits
});
303 $self->years($dataset->{categories
}->{years
});
304 $self->locations($dataset->{categories
}->{locations
});
305 $self->breeding_programs($dataset->{categories
}->{breeding_programs
});
306 $self->genotyping_protocols($dataset->{categories
}->{genotyping_protocols
});
307 $self->trial_designs($dataset->{categories
}->{trial_designs
});
308 $self->trial_types($dataset->{categories
}->{trial_types
});
309 $self->category_order($dataset->{category_order
});
310 $self->is_live($dataset->{is_live
});
314 else { print STDERR
"Creating empty dataset object\n"; }
322 =head2 datasets_by_user()
327 sub get_datasets_by_user
{
329 my $people_schema = shift;
330 my $sp_person_id = shift;
332 my $rs = $people_schema->resultset("SpDataset")->search( { sp_person_id
=> $sp_person_id });
335 while (my $row = $rs->next()) {
336 push @datasets, [ $row->sp_dataset_id(), $row->name(), $row->description() ];
342 =head2 exists_dataset_name
353 sub exists_dataset_name
{
355 my $people_schema = shift;
358 my $rs = $people_schema->resultset("SpDataset")->search( { name
=> { -ilike
=> $name}});
360 if ($rs->count() > 0) {
380 my $dataref = $self->get_dataset_data();
382 my $json = JSON
::Any
->encode($dataref);
385 name
=> $self->name(),
386 description
=> $self->description(),
387 sp_person_id
=> $self->sp_person_id(),
405 print STDERR
"dataset_id = ".$self->sp_dataset_id()."\n";
406 if (!$self->has_sp_dataset_id()) {
407 print STDERR
"Creating new dataset row... ".$self->sp_dataset_id()."\n";
408 my $row = $self->people_schema()->resultset("SpDataset")->create($self->to_hashref());
409 $self->sp_dataset_id($row->sp_dataset_id());
410 return $row->sp_dataset_id();
413 print STDERR
"Updating dataset row ".$self->sp_dataset_id()."\n";
414 my $row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id
=> $self->sp_dataset_id() });
416 $row->name($self->name());
417 $row->description($self->description());
418 $row->dataset(JSON
::Any
->encode($self->to_hashref()));
419 $row->sp_person_id($self->sp_person_id());
421 return $row->sp_dataset_id();
424 print STDERR
"Weird... has ".$self->sp_dataset_id()." but no data in db\n";
429 sub get_dataset_data
{
432 $dataref->{categories
}->{accessions
} = $self->accessions() if $self->has_accessions();
433 $dataref->{categories
}->{plots
} = $self->plots() if $self->has_plots();
434 $dataref->{categories
}->{plants
} = $self->plants() if $self->has_plants();
435 $dataref->{categories
}->{trials
} = $self->trials() if $self->has_trials();
436 $dataref->{categories
}->{traits
} = $self->traits() if $self->has_traits();
437 $dataref->{categories
}->{years
} = $self->years() if $self->has_years();
438 $dataref->{categories
}->{breeding_programs
} = $self->breeding_programs() if $self->has_breeding_programs();
439 $dataref->{categories
}->{genotyping_protocols
} = $self->genotyping_protocols() if $self->has_genotyping_protocols();
440 $dataref->{categories
}->{trial_designs
} = $self->trial_designs() if $self->has_trial_designs();
441 $dataref->{categories
}->{trial_types
} = $self->trial_types() if $self->has_trial_types();
442 $dataref->{categories
}->{locations
} = $self->locations() if $self->has_locations();
443 $dataref->{category_order
} = $self->category_order();
451 $dataref->{accessions
} = join(",", @
{$self->accessions()}) if $self->has_accessions();
452 $dataref->{plots
} = join(",", @
{$self->plots()}) if $self->has_plots();
453 $dataref->{plants
} = join(",", @
{$self->plants()}) if $self->has_plants();
454 $dataref->{trials
} = join(",", @
{$self->trials()}) if $self->has_trials();
455 $dataref->{traits
} = join(",", @
{$self->traits()}) if $self->has_traits();
456 $dataref->{years
} = join(",", @
{$self->years()}) if $self->has_years();
457 $dataref->{breeding_programs
} = join(",", @
{$self->breeding_programs()}) if $self->has_breeding_programs();
458 $dataref->{genotyping_protocols
} = join(",", @
{$self->genotyping_protocols()}) if $self->has_genotyping_protocols();
459 $dataref->{trial_designs
} = join(",", @
{$self->trial_designs()}) if $self->has_trial_designs();
460 $dataref->{trial_types
} = join(",", @
{$self->trial_types()}) if $self->has_trial_types();
461 $dataref->{locations
} = join(",", @
{$self->locations()}) if $self->has_locations();
465 sub _get_source_dataref
{
467 my $source_type = shift;
471 $dataref->{$source_type} = $self->_get_dataref();
476 =head2 retrieve_genotypes()
478 Retrieves genotypes as a listref of hashrefs.
482 sub retrieve_genotypes
{
484 my $protocol_id = shift;
485 my $genotypeprop_hash_select = shift || ['DS'];
486 my $protocolprop_top_key_select = shift || [];
487 my $protocolprop_marker_hash_select = shift || [];
488 my $return_only_first_genotypeprop_for_stock = shift || 1;
489 my $chromosome_list = shift || [];
490 my $start_position = shift;
491 my $end_position = shift;
492 my $marker_name_list = shift || [];
494 my $genotypes_search = CXGN
::Genotype
::Search
->new(
495 bcs_schema
=> $self->schema(),
496 people_schema
=>$self->people_schema,
497 accession_list
=> $self->accessions(),
498 trial_list
=> $self->trials(),
499 protocol_id_list
=> [$protocol_id],
500 chromosome_list
=> $chromosome_list,
501 start_position
=> $start_position,
502 end_position
=> $end_position,
503 marker_name_list
=> $marker_name_list,
504 genotypeprop_hash_select
=>$genotypeprop_hash_select, #THESE ARE THE KEYS IN THE GENOTYPEPROP OBJECT
505 protocolprop_top_key_select
=>$protocolprop_top_key_select, #THESE ARE THE KEYS AT THE TOP LEVEL OF THE PROTOCOLPROP OBJECT
506 protocolprop_marker_hash_select
=>$protocolprop_marker_hash_select, #THESE ARE THE KEYS IN THE MARKERS OBJECT IN THE PROTOCOLPROP OBJECT
507 return_only_first_genotypeprop_for_stock
=>$return_only_first_genotypeprop_for_stock #FOR MEMORY REASONS TO LIMIT DATA
509 my ($total_count, $dataref) = $genotypes_search->get_genotype_info();
513 =head2 retrieve_phenotypes()
515 retrieves phenotypes as a listref of listrefs
519 sub retrieve_phenotypes
{
521 my $phenotypes_search = CXGN
::Phenotypes
::PhenotypeMatrix
->new(
522 search_type
=>'MaterializedViewTable',
523 bcs_schema
=>$self->schema(),
524 data_level
=>$self->data_level(),
525 trait_list
=>$self->traits(),
526 trial_list
=>$self->trials(),
527 accession_list
=>$self->accessions(),
528 exclude_phenotype_outlier
=>$self->exclude_phenotype_outlier
530 my @data = $phenotypes_search->get_phenotype_matrix();
534 =head2 retrieve_accessions()
536 retrieves accessions as a listref of listref [stock_id, uniquname]
540 sub retrieve_accessions
{
543 if ($self->has_accessions()) {
544 return $self->accessions();
547 my $criteria = $self->get_dataset_definition();
548 push @
$criteria, "accessions";
550 $accessions = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("accessions"));
552 return $accessions->{results
};
555 =head2 retrieve_plots()
557 Retrieves plots as a listref of listrefs.
564 if ($self->has_plots()) {
565 return $self->plots();
568 my $criteria = $self->get_dataset_definition();
569 push @
$criteria, "plots";
570 $plots = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("plots"));
572 return $plots->{results
};
575 =head2 retrieve_plants()
577 Retrieves plants as a listref of listrefs.
581 sub retrieve_plants
{
584 if ($self->has_plants()) {
585 return $self->plants();
588 my $criteria = $self->get_dataset_definition();
589 push @
$criteria, "plants";
590 $plants = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("plants"));
592 return $plants->{results
};
595 =head2 retrieve_trials()
597 retrieves trials as a listref of listrefs.
601 sub retrieve_trials
{
604 if ($self->has_trials()) {
605 return $self->trials();
608 my $criteria = $self->get_dataset_definition();
609 push @
$criteria, "trials";
610 $trials = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trials"));
612 print STDERR
"TRIALS: ".Dumper
($trials);
613 return $trials->{results
};
616 =head2 retrieve_traits()
618 retrieves traits as a listref of listrefs.
622 sub retrieve_traits
{
625 if ($self->has_traits()) {
626 return $self->traits();
629 my $criteria = $self->get_dataset_definition();
630 push @
$criteria, "traits";
631 $traits = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("traits"));
633 return $traits->{results
};
637 =head2 retrieve_years()
639 retrieves years as a listref of listrefs
646 if ($self->has_years()) {
647 return $self->years();
650 my $criteria = $self->get_dataset_definition();
651 push @
$criteria, "years";
652 my $year_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("years"));
653 my $year_list = $year_data->{result
};
655 foreach my $y (@
$year_list) {
656 push @years, $y->[0];
662 =head2 retrieve_years()
664 retrieves years as a listref of listrefs
668 sub retrieve_locations
{
671 if ($self->has_locations()) {
672 return $self->locations();
675 my $criteria = $self->get_dataset_definition();
676 push @
$criteria, "locations";
677 my $location_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("locations"));
678 my $location_list = $location_data->{result
};
680 foreach my $y (@
$location_list) {
681 push @locations, $y->[0];
687 =head2 retrieve_breeding_programs
698 sub retrieve_breeding_programs
{
700 my @breeding_programs;
701 if ($self->has_breeding_programs()) {
702 return $self->breeding_programs();
705 my $criteria = $self->get_dataset_definition();
706 push @
$criteria, "breeding_programs";
707 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("breeding_programs"));
708 my $breeding_program_list = $breeding_program_data->{result
};
710 foreach my $y (@
$breeding_program_list) {
711 push @breeding_programs, $y->[0];
714 return \
@breeding_programs;
717 =head2 retrieve_genotyping_protocols
728 sub retrieve_genotyping_protocols
{
730 my @genotyping_protocols;
731 if ($self->has_genotyping_protocols()) {
732 return $self->genotyping_protocols();
735 my $criteria = $self->get_dataset_definition();
736 push @
$criteria, "genotyping_protocols";
737 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("genotyping_protocols"));
738 my $breeding_program_list = $breeding_program_data->{result
};
740 foreach my $y (@
$breeding_program_list) {
741 push @genotyping_protocols, $y->[0];
744 return \
@genotyping_protocols;
747 =head2 retrieve_trial_designs
758 sub retrieve_trial_designs
{
761 if ($self->has_trial_designs()) {
762 return $self->trial_designs();
765 my $criteria = $self->get_dataset_definition();
766 push @
$criteria, "trial_designs";
767 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trial_designs"));
768 my $breeding_program_list = $breeding_program_data->{result
};
770 foreach my $y (@
$breeding_program_list) {
771 push @trial_designs, $y->[0];
774 return \
@trial_designs;
778 =head2 retrieve_trial_types
789 sub retrieve_trial_types
{
792 if ($self->has_trial_types()) {
793 return $self->trial_types();
796 my $criteria = $self->get_dataset_definition();
797 push @
$criteria, "trial_types";
798 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trial_types"));
799 my $breeding_program_list = $breeding_program_data->{result
};
801 foreach my $y (@
$breeding_program_list) {
802 push @trial_types, $y->[0];
805 return \
@trial_types;
809 sub get_dataset_definition
{
813 if ($self->has_accessions()) {
814 push @criteria, "accessions";
816 if ($self->has_plots()) {
817 push @criteria, "plots";
819 if ($self->has_plants()) {
820 push @criteria, "plants";
822 if ($self->has_trials()) {
823 push @criteria, "trials";
825 if ($self->has_traits()) {
826 push @criteria, "traits";
828 if ($self->has_years()) {
829 push @criteria, "years";
831 if ($self->has_locations()) {
832 push @criteria, "locations";
834 if ($self->has_breeding_programs()) {
835 push @criteria, "breeding_programs";
837 if ($self->has_genotyping_protocols()) {
838 push @criteria, "genotyping_protocols";
840 if ($self->has_trial_types()) {
841 push @criteria, "trial_types";
843 if ($self->has_trial_designs()) {
844 push @criteria, "trial_designs";
854 Usage: $dataset->delete();
855 Desc: Deletes the specified dataset. Returns a string with an
856 error message is unsuccessful.
857 Ret: string if failure, undef if success
859 Side Effects: The function does not check for ownership of the dataset,
860 this has to be implemented in the calling function.
868 my $row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id
=> $self->sp_dataset_id() });
871 return "The specified dataset does not exist";
879 return "An error occurred, $@";