4 CXGN::Dataset - a class to easily query the database for breeding data
8 CXGN::Dataset can be used to flexibly define datasets for breeding applications. For example, a dataset can be defined using a list of germplasm, a list of trials, a list of years, etc., or a combination of the above. Once defined, it makes it easy to obtain the related phenotypes, genotypes, and other data.
10 Datasets can be stored in the database and retrieved for later use.
12 Currently, there are three incarnations of CXGN::Dataset:
18 Unbuffered output of the queries
20 =item CXGN::Dataset::File
22 Writes results to files
24 =item CXGN::Dataset::Cache
26 Returns output like CXGN::Dataset, but uses a disk-cache for the response data
32 my $ds = CXGN::Dataset->new( people_schema => $p, schema => $s);
33 $ds->accessions([ 'a', 'b', 'c' ]);
34 my $trials = $ds->retrieve_trials();
35 my $sp_dataset_id = $ds->store();
37 my $restored_ds = CXGN::Dataset->new( people_schema => $p, schema => $s, sp_dataset_id => $sp_dataset_id );
38 my $years = $restored_ds->retrieve_years();
43 Lukas Mueller <lam87@cornell.edu>
51 package CXGN
::Dataset
;
54 use Moose
::Util
::TypeConstraints
;
57 use CXGN
::BreederSearch
;
58 use CXGN
::People
::Schema
;
59 use CXGN
::Phenotypes
::PhenotypeMatrix
;
60 use CXGN
::Genotype
::Search
;
62 =head2 people_schema()
64 accessor for CXGN::People::Schema database object
68 has
'people_schema' => (isa
=> 'CXGN::People::Schema', is
=> 'rw', required
=> 1 );
72 accessor for Bio::Chado::Schema database object
76 has
'schema' => ( isa
=> "Bio::Chado::Schema", is
=> 'rw', required
=> 1 );
78 =head2 sp_dataset_id()
80 accessor for sp_dataset primary key
85 has
'sp_dataset_id' => ( isa
=> 'Int',
87 predicate
=> 'has_sp_dataset_id',
92 accessor for the json-formatted data structure (as used for the backend storage)
96 has
'data' => ( isa
=> 'HashRef',
102 accessor for the name of this dataset
106 has
'name' => ( isa
=> 'Maybe[Str]',
112 accessor for the description of this dataset
116 has
'description' => ( isa
=> 'Maybe[Str]',
120 =head2 sp_person_id()
122 accessor for sp_person_id (owner of the dataset)
126 has
'sp_person_id' => ( isa
=> 'Maybe[Int]',
133 accessor for defining the accessions that are part of this dataset (ArrayRef).
137 has
'accessions' => ( isa
=> 'Maybe[ArrayRef]',
139 predicate
=> 'has_accessions',
144 accessor for defining the plots that are part of this dataset (ArrayRef).
148 has
'plots' => ( isa
=> 'Maybe[ArrayRef]',
150 predicate
=> 'has_plots',
156 accessor for defining the trials that are part of this dataset (ArrayRef).
161 has
'trials' => ( isa
=> 'Maybe[ArrayRef]',
163 predicate
=> 'has_trials',
171 has
'traits' => ( isa
=> 'Maybe[ArrayRef]',
173 predicate
=> 'has_traits',
181 has
'years' => ( isa
=> 'Maybe[ArrayRef]',
183 predicate
=> 'has_years',
186 =head2 breeding_programs()
190 has
'breeding_programs' => ( isa
=> 'Maybe[ArrayRef]',
192 predicate
=> 'has_breeding_programs',
193 default => sub { [] },
196 =head2 genotyping_protocols()
200 has
'genotyping_protocols' => ( isa
=> 'Maybe[ArrayRef]',
202 predicate
=> 'has_genotyping_protocols',
209 has
'trial_types' => ( isa
=> 'Maybe[ArrayRef]',
211 predicate
=> 'has_trial_types',
214 =head2 trial_designs()
218 has
'trial_designs' => ( isa
=> 'Maybe[ArrayRef]',
220 predicate
=> 'has_trial_designs',
227 has
'locations' => ( isa
=> 'Maybe[ArrayRef]',
229 predicate
=> 'has_locations',
233 has
'category_order' => ( isa
=> 'Maybe[ArrayRef]',
235 predicate
=> 'has_category_order',
242 has
'is_live' => ( isa
=> 'Bool',
252 has
'data_level' => ( isa
=> 'String',
254 isa
=> enum
([qw
[ plot plant subplot
]]),
258 =head2 exclude_phenotype_outlier()
262 has
'exclude_phenotype_outlier' => (
268 has
'breeder_search' => (isa
=> 'CXGN::BreederSearch', is
=> 'rw');
274 print STDERR
"Processing dataset_id ".$self->sp_dataset_id()."\n";
275 my $bs = CXGN
::BreederSearch
->new(dbh
=> $self->schema->storage->dbh());
276 $self->breeder_search($bs);
278 if ($self->has_sp_dataset_id()) {
279 my $row = $self->people_schema()->resultset("SpDataset")->find({ sp_dataset_id
=> $self->sp_dataset_id() });
280 if (!$row) { die "The dataset with id ".$self->sp_dataset_id()." does not exist"; }
281 my $dataset = JSON
::Any
->decode($row->dataset());
282 $self->data($dataset);
283 $self->name($row->name());
284 $self->description($row->description());
285 $self->sp_person_id($row->sp_person_id());
286 $self->accessions($dataset->{categories
}->{accessions
});
287 $self->plots($dataset->{categories
}->{plots
});
288 $self->trials($dataset->{categories
}->{trials
});
289 $self->traits($dataset->{categories
}->{traits
});
290 $self->years($dataset->{categories
}->{years
});
291 $self->locations($dataset->{categories
}->{locations
});
292 $self->breeding_programs($dataset->{categories
}->{breeding_programs
});
293 $self->genotyping_protocols($dataset->{categories
}->{genotyping_protocols
});
294 $self->trial_designs($dataset->{categories
}->{trial_designs
});
295 $self->trial_types($dataset->{categories
}->{trial_types
});
296 $self->category_order($dataset->{category_order
});
297 $self->is_live($dataset->{is_live
});
301 else { print STDERR
"Creating empty dataset object\n"; }
309 =head2 get_datasets_by_user()
314 sub get_datasets_by_user
{
316 my $people_schema = shift;
317 my $sp_person_id = shift;
319 my $rs = $people_schema->resultset("SpDataset")->search( { sp_person_id
=> $sp_person_id });
322 while (my $row = $rs->next()) {
323 push @datasets, [ $row->sp_dataset_id(), $row->name(), $row->description() ];
329 =head2 exists_dataset_name
340 sub exists_dataset_name
{
342 my $people_schema = shift;
345 my $rs = $people_schema->resultset("SpDataset")->search( { name
=> { -ilike
=> $name}});
347 if ($rs->count() > 0) {
365 my $dataref = $self->get_dataset_data();
367 my $json = JSON
::Any
->encode($dataref);
369 my $data = { name
=> $self->name(),
370 description
=> $self->description(),
371 sp_person_id
=> $self->sp_person_id(),
377 print STDERR
"dataset_id = ".$self->sp_dataset_id()."\n";
378 if (!$self->has_sp_dataset_id()) {
379 print STDERR
"Creating new dataset row... ".$self->sp_dataset_id()."\n";
380 my $row = $self->people_schema()->resultset("SpDataset")->create($data);
381 $self->sp_dataset_id($row->sp_dataset_id());
382 return $row->sp_dataset_id();
385 print STDERR
"Updating dataset row ".$self->sp_dataset_id()."\n";
386 my $row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id
=> $self->sp_dataset_id() });
388 $row->name($self->name());
389 $row->description($self->description());
390 $row->dataset($json);
391 $row->sp_person_id($self->sp_person_id());
393 return $row->sp_dataset_id();
396 print STDERR
"Weird... has ".$self->sp_dataset_id()." but no data in db\n";
401 sub get_dataset_data
{
404 $dataref->{categories
}->{accessions
} = $self->accessions() if $self->has_accessions();
405 $dataref->{categories
}->{plots
} = $self->plots() if $self->has_plots();
406 $dataref->{categories
}->{trials
} = $self->trials() if $self->has_trials();
407 $dataref->{categories
}->{traits
} = $self->traits() if $self->has_traits();
408 $dataref->{categories
}->{years
} = $self->years() if $self->has_years();
409 $dataref->{categories
}->{breeding_programs
} = $self->breeding_programs() if $self->has_breeding_programs();
410 $dataref->{categories
}->{genotyping_protocols
} = $self->genotyping_protocols() if $self->has_genotyping_protocols();
411 $dataref->{categories
}->{trial_designs
} = $self->trial_designs() if $self->has_trial_designs();
412 $dataref->{categories
}->{trial_types
} = $self->trial_types() if $self->has_trial_types();
413 $dataref->{categories
}->{locations
} = $self->locations() if $self->has_locations();
414 $dataref->{category_order
} = $self->category_order();
422 $dataref->{accessions
} = join(",", @
{$self->accessions()}) if $self->has_accessions();
423 $dataref->{plots
} = join(",", @
{$self->plots()}) if $self->has_plots();
424 $dataref->{trials
} = join(",", @
{$self->trials()}) if $self->has_trials();
425 $dataref->{traits
} = join(",", @
{$self->traits()}) if $self->has_traits();
426 $dataref->{years
} = join(",", @
{$self->years()}) if $self->has_years();
427 $dataref->{breeding_programs
} = join(",", @
{$self->breeding_programs()}) if $self->has_breeding_programs();
428 $dataref->{genotyping_protocols
} = join(",", @
{$self->genotyping_protocols()}) if $self->has_genotyping_protocols();
429 $dataref->{trial_designs
} = join(",", @
{$self->trial_designs()}) if $self->has_trial_designs();
430 $dataref->{trial_types
} = join(",", @
{$self->trial_types()}) if $self->has_trial_types();
431 $dataref->{locations
} = join(",", @
{$self->locations()}) if $self->has_locations();
435 sub _get_source_dataref
{
437 my $source_type = shift;
441 $dataref->{$source_type} = $self->_get_dataref();
446 =head2 retrieve_genotypes()
448 Retrieves genotypes as a listref of hashrefs.
452 sub retrieve_genotypes
{
454 my $protocol_id = shift;
456 my $genotypes_search = CXGN
::Genotype
::Search
->new(
457 bcs_schema
=> $self->schema(),
458 accession_list
=> $self->accessions(),
459 trial_list
=> $self->trials(),
460 protocol_id_list
=> [$protocol_id]
462 my ($total_count, $dataref) = $genotypes_search->get_genotype_info();
466 =head2 retrieve_phenotypes()
468 retrieves phenotypes as a listref of listrefs
472 sub retrieve_phenotypes
{
474 my $phenotypes_search = CXGN
::Phenotypes
::PhenotypeMatrix
->new(
475 search_type
=>'MaterializedViewTable',
476 bcs_schema
=>$self->schema(),
477 data_level
=>$self->data_level(),
478 trait_list
=>$self->traits(),
479 trial_list
=>$self->trials(),
480 accession_list
=>$self->accessions(),
481 exclude_phenotype_outlier
=>$self->exclude_phenotype_outlier
483 my @data = $phenotypes_search->get_phenotype_matrix();
487 =head2 retrieve_accessions()
489 retrieves accessions as a listref of listrefs [stock_id, uniquename]
493 sub retrieve_accessions
{
496 if ($self->has_accessions()) {
497 return $self->accessions();
500 my $criteria = $self->get_dataset_definition();
501 push @
$criteria, "accessions";
503 $accessions = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("accessions"));
505 return $accessions->{results
};
508 =head2 retrieve_plots()
510 Retrieves plots as a listref of listrefs.
517 if ($self->has_plots()) {
518 return $self->plots();
521 my $criteria = $self->get_dataset_definition();
522 push @
$criteria, "plots";
523 $plots = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("plots"));
525 return $plots->{results
};
528 =head2 retrieve_trials()
530 retrieves trials as a listref of listrefs.
534 sub retrieve_trials
{
537 if ($self->has_trials()) {
538 return $self->trials();
541 my $criteria = $self->get_dataset_definition();
542 push @
$criteria, "trials";
543 $trials = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trials"));
545 print STDERR
"TRIALS: ".Dumper
($trials);
546 return $trials->{results
};
549 =head2 retrieve_traits()
551 retrieves traits as a listref of listrefs.
555 sub retrieve_traits
{
558 if ($self->has_traits()) {
559 return $self->traits();
562 my $criteria = $self->get_dataset_definition();
563 push @
$criteria, "traits";
564 $traits = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("traits"));
566 return $traits->{results
};
570 =head2 retrieve_years()
572 retrieves years as a listref of listrefs
579 if ($self->has_years()) {
580 return $self->years();
583 my $criteria = $self->get_dataset_definition();
584 push @
$criteria, "years";
585 my $year_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("years"));
586 my $year_list = $year_data->{result
};
588 foreach my $y (@
$year_list) {
589 push @years, $y->[0];
595 =head2 retrieve_locations()
597 retrieves locations as a listref
601 sub retrieve_locations
{
604 if ($self->has_locations()) {
605 return $self->locations();
608 my $criteria = $self->get_dataset_definition();
609 push @
$criteria, "locations";
610 my $location_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("locations"));
611 my $location_list = $location_data->{result
};
613 foreach my $y (@
$location_list) {
614 push @locations, $y->[0];
620 =head2 retrieve_breeding_programs
631 sub retrieve_breeding_programs
{
633 my @breeding_programs;
634 if ($self->has_breeding_programs()) {
635 return $self->breeding_programs();
638 my $criteria = $self->get_dataset_definition();
639 push @
$criteria, "breeding_programs";
640 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("breeding_programs"));
641 my $breeding_program_list = $breeding_program_data->{result
};
643 foreach my $y (@
$breeding_program_list) {
644 push @breeding_programs, $y->[0];
647 return \
@breeding_programs;
650 =head2 retrieve_genotyping_protocols
661 sub retrieve_genotyping_protocols
{
663 my @genotyping_protocols;
664 if ($self->has_genotyping_protocols()) {
665 return $self->genotyping_protocols();
668 my $criteria = $self->get_dataset_definition();
669 push @
$criteria, "genotyping_protocols";
670 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("genotyping_protocols"));
671 my $breeding_program_list = $breeding_program_data->{result
};
673 foreach my $y (@
$breeding_program_list) {
674 push @genotyping_protocols, $y->[0];
677 return \
@genotyping_protocols;
680 =head2 retrieve_trial_designs
691 sub retrieve_trial_designs
{
694 if ($self->has_trial_designs()) {
695 return $self->trial_designs();
698 my $criteria = $self->get_dataset_definition();
699 push @
$criteria, "trial_designs";
700 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trial_designs"));
701 my $breeding_program_list = $breeding_program_data->{result
};
703 foreach my $y (@
$breeding_program_list) {
704 push @trial_designs, $y->[0];
707 return \
@trial_designs;
711 =head2 retrieve_trial_types
722 sub retrieve_trial_types
{
725 if ($self->has_trial_types()) {
726 return $self->trial_types();
729 my $criteria = $self->get_dataset_definition();
730 push @
$criteria, "trial_types";
731 my $breeding_program_data = $self->breeder_search()->metadata_query($criteria, $self->_get_source_dataref("trial_types"));
732 my $breeding_program_list = $breeding_program_data->{result
};
734 foreach my $y (@
$breeding_program_list) {
735 push @trial_types, $y->[0];
738 return \
@trial_types;
742 sub get_dataset_definition
{
746 if ($self->has_accessions()) {
747 push @criteria, "accessions";
749 if ($self->has_plots()) {
750 push @criteria, "plots";
752 if ($self->has_trials()) {
753 push @criteria, "trials";
755 if ($self->has_traits()) {
756 push @criteria, "traits";
758 if ($self->has_years()) {
759 push @criteria, "years";
761 if ($self->has_locations()) {
762 push @criteria, "locations";
764 if ($self->has_breeding_programs()) {
765 push @criteria, "breeding_programs";
767 if ($self->has_genotyping_protocols()) {
768 push @criteria, "genotyping_protocols";
770 if ($self->has_trial_types()) {
771 push @criteria, "trial_types";
773 if ($self->has_trial_designs()) {
774 push @criteria, "trial_designs";
784 Usage: $dataset->delete();
785 Desc: Deletes the specified dataset. Returns a string with an
786 error message if unsuccessful.
787 Ret: string if failure, undef if success
789 Side Effects: The function does not check for ownership of the dataset,
790 this has to be implemented in the calling function.
798 my $row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id
=> $self->sp_dataset_id() });
801 return "The specified dataset does not exist";
809 return "An error occurred, $@";