3 CXGN::BreederSearch - class for retrieving breeder information for the breeder search wizard
7 Lukas Mueller <lam87@cornell.edu>
8 Aimin Yan <ay247@cornell.edu>
14 package CXGN
::BreederSearch
;
31 Usage: my %info = $bs->metadata_query($criteria_list, $dataref, $queryref);
33 Ret: returns a hash with a key called results that contains
34 a listref of listrefs specifying the matching list with ids
36 Args: criteria_list: a comma separated string called a criteria_list,
37 listing all the criteria that need to be applied. Possible
38 criteria are trials, years, traits, and locations. The last
39 criteria in the list is the return type.
40 dataref: The dataref is a hashref of hashrefs. The first key
41 is the target of the transformation, and the second is the
42 source type of the transformation, containing comma separated
43 values of the source type.
44 queryref: same structure as dataref, but instead of storing ids it stores a
45 1 if user requested intersect, or 0 for default union
46 Side Effects: will run refresh_matviews() if matviews aren't already populated
54 my $criteria_list = shift;
58 print STDERR
"criteria_list=" . Dumper
($criteria_list);
59 print STDERR
"dataref=" . Dumper
($dataref);
60 print STDERR
"queryref=" . Dumper
($queryref);
62 # Check if matviews are populated, and run refresh if they aren't. Which, as of postgres 9.5, will be the case when our databases are loaded from a dump. This should no longer be necessary once this bug is fixed in newer postgres versions
63 my ($status, %response_hash);
65 my $populated_query = "select * from materialized_phenoview limit 1";
66 my $sth = $self->dbh->prepare($populated_query);
68 } catch
{ #if test query fails because views aren't populated
69 print STDERR
"Using basic refresh to populate views . . .\n";
70 $status = $self->refresh_matviews($c->config->{dbhost
}, $c->config->{dbname
}, $c->config->{dbuser
}, $c->config->{dbpass
}, 'basic');
71 %response_hash = %$status;
74 if (%response_hash && $response_hash{'message'} eq 'Wizard update completed!') {
75 print STDERR
"Populated views, now proceeding with query . . . .\n";
76 } elsif (%response_hash && $response_hash{'message'} eq 'Wizard update initiated.') {
77 return { error
=> "The search wizard is temporarily unavailable while database indexes are being repopulated. Please try again later. Depending on the size of the database, it will be ready within a few seconds to an hour."};
78 } elsif (%response_hash && $response_hash{'error'}) {
79 return { error
=> $response_hash{'error'} };
82 my $target_table = $criteria_list->[-1];
83 print STDERR
"target_table=". $target_table . "\n";
84 my $target = $target_table;
87 my $select = "SELECT ".$target."_id, ".$target."_name ";
88 my $group = "GROUP BY ".$target."_id, ".$target."_name ";
91 if (!$dataref->{"$target_table"}) {
92 my $from = "FROM public.". $target_table;
93 my $where = " WHERE ".$target."_id IS NOT NULL";
94 $full_query = $select . $from . $where;
98 foreach my $category (@
$criteria_list) {
100 if ($dataref->{$criteria_list->[-1]}->{$category}) {
102 my @categories = ($target_table, $category);
103 @categories = sort @categories;
104 my $from = "FROM public.". $categories[0] ."x". $categories[1] . " JOIN public." . $target_table . " USING(" . $target."_id) ";
105 my $criterion = $category;
106 $criterion =~ s/s$//;
107 my $intersect = $queryref->{$criteria_list->[-1]}->{$category};
111 my @ids = split(/,/, $dataref->{$criteria_list->[-1]}->{$category});
112 foreach my $id (@ids) {
113 my $where = "WHERE ". $criterion. "_id IN (". $id .") ";
114 my $statement = $select . $from . $where . $group;
115 push @parts, $statement;
117 $query = join (" INTERSECT ", @parts);
118 push @queries, $query;
121 my $where = "WHERE ". $criterion. "_id IN (" . $dataref->{$criteria_list->[-1]}->{$category} . ") ";
122 $query = $select . $from . $where . $group;
123 push @queries, $query;
127 $full_query = join (" INTERSECT ", @queries);
129 $full_query .= " ORDER BY 2";
130 print STDERR
"QUERY: $full_query\n";
131 $h = $self->dbh->prepare($full_query);
135 while (my ($id, $name) = $h->fetchrow_array()) {
136 push @results, [ $id, $name ];
139 if (@results >= 10_000
) {
140 return { error
=> scalar(@results).' matches. Too many results to display' };
142 elsif (@results < 1) {
143 return { error
=> scalar(@results).' matches. No results to display' };
146 return { results
=> \
@results };
150 =head2 avg_phenotypes_query
152 parameters: trait_id, trial_id, allow_missing
154 returns: values, the avg pheno value of each accession in a trial for the given trait, and column_names, an array of the trait names
160 sub avg_phenotypes_query
{
162 my $trial_id = shift;
163 my $trait_ids = shift;
165 my $allow_missing = shift;
166 my $reference_accession = shift;
169 my $select = "SELECT table0.accession_id, table0.accession_name";
170 my $from = " FROM (SELECT accession_id, accession_name FROM materialized_phenoview WHERE trial_id = $trial_id GROUP BY 1,2) AS table0";
171 for (my $i = 1; $i <= scalar @
$trait_ids; $i++) {
172 $select .= ", ROUND( CAST(table$i.trait$i AS NUMERIC), 2)";
173 $from .= " JOIN (SELECT accession_id, accession_name, AVG(phenotype_value::REAL) AS trait$i FROM materialized_phenoview WHERE trial_id = $trial_id AND trait_id = ? GROUP BY 1,2) AS table$i USING (accession_id)";
175 my $query = $select . $from . " ORDER BY 2";
176 if ($allow_missing eq 'true') { $query =~ s/JOIN/FULL OUTER JOIN/g; }
178 print STDERR
"QUERY: $query\n";
180 my $h = $self->dbh->prepare($query);
181 $h->execute(@
$trait_ids);
183 my @weights = @
$weights;
184 my (@raw_avg_values, @reference_values, @rows_to_scale, @weighted_values);
186 if ($reference_accession) {
188 while (my @row = $h->fetchrow_array()) {
189 push @rows_to_scale, @row;
190 my ($id, $name, @avg_values) = @row;
191 if ($id == $reference_accession) { @reference_values = @avg_values; }
194 print STDERR
"reference accession id = $reference_accession and reference values = @reference_values\n";
196 $h->execute(@
$trait_ids);
197 while (my ($id, $name, @avg_values) = $h->fetchrow_array()) {
199 my @scaled_values = map {sprintf("%.2f", $avg_values[$_] / $reference_values[$_])} 0..$#avg_values;
200 my @scaled_and_weighted = map {sprintf("%.2f", $scaled_values[$_] * $weights[$_])} 0..$#scaled_values;
201 unshift @scaled_values, '<a href="/stock/'.$id.'/view">'.$name.'</a>';
202 unshift @scaled_and_weighted, '<a href="/stock/'.$id.'/view">'.$name.'</a>';
204 push @raw_avg_values, [@scaled_values];
207 map { $sum += $_ } @scaled_and_weighted;
208 my $rounded_sum = sprintf("%.2f", $sum);
209 push @scaled_and_weighted, $rounded_sum;
210 push @weighted_values, [@scaled_and_weighted];
215 while (my ($id, $name, @avg_values) = $h->fetchrow_array()) {
217 my @values_to_weight = @avg_values;
218 unshift @avg_values, '<a href="/stock/'.$id.'/view">'.$name.'</a>';
219 push @raw_avg_values, [@avg_values];
221 @values_to_weight = map {$values_to_weight[$_] * $weights[$_]} 0..$#values_to_weight;
223 map { $sum += $_ } @values_to_weight;
224 unshift @values_to_weight, '<a href="/stock/'.$id.'/view">'.$name.'</a>';
225 my $rounded_sum = sprintf("%.2f", $sum);
226 push @values_to_weight, $rounded_sum;
227 push @weighted_values, [@values_to_weight];
232 my @weighted_values2 = sort { $b->[-1] <=> $a->[-1] } @weighted_values;
233 my @weighted_values3;
234 for (my $i = 0; $i < scalar @weighted_values2; $i++ ) {
235 my $temp_array = $weighted_values2[$i];
236 my @temp_array = @
$temp_array;
237 push @temp_array, $i+1;
238 push @weighted_values3, [@temp_array];
241 print STDERR
"avg_phenotypes: ".Dumper
(@raw_avg_values);
242 print STDERR
"avg_phenotypes: ".Dumper
(@weighted_values3);
245 raw_avg_values
=> \
@raw_avg_values,
246 weighted_values
=> \
@weighted_values3
251 =head2 refresh_matviews
253 parameters: string to specify desired refresh type, basic or concurrent. defaults to concurrent
255 returns: message detailing success or error
257 Side Effects: Refreshes materialized views
261 sub refresh_matviews
{
268 my $refresh_type = shift || 'concurrent';
269 my $refresh_finished = 0;
272 my $q = "SELECT currently_refreshing FROM public.matviews WHERE mv_id=?";
273 my $h = $self->dbh->prepare($q);
276 my $refreshing = $h->fetchrow_array();
279 return { error
=> 'Wizard update already in progress . . . ' };
283 my $dbh = $self->dbh();
284 if ($refresh_type eq 'concurrent') {
285 #print STDERR "Using CXGN::Tools::Run to run perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass -c";
286 $async_refresh = CXGN
::Tools
::Run
->run_async("perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass -c");
288 print STDERR
"Using CXGN::Tools::Run to run perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass";
289 $async_refresh = CXGN
::Tools
::Run
->run_async("perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass");
292 for (my $i = 1; $i < 10; $i++) {
294 if ($async_refresh->alive) {
297 $refresh_finished = 1;
301 if ($refresh_finished) {
302 return { message
=> 'Wizard update completed!' };
304 return { message
=> 'Wizard update initiated.' };
307 print STDERR
'Error initiating wizard update.' . $@
. "\n";
308 return { error
=> 'Error initiating wizard update.' . $@
};
313 =head2 matviews_status
315 Desc: checks tracking table to see if materialized views are updating, and if not, when they were last updated.
319 returns: refreshing message or timestamp
325 sub matviews_status
{
327 my $q = "SELECT currently_refreshing, last_refresh FROM public.matviews WHERE mv_id=?";
328 my $h = $self->dbh->prepare($q);
331 my ($refreshing, $timestamp) = $h->fetchrow_array();
334 print STDERR
"Wizard is already refreshing, current status: $refreshing \n";
335 return { refreshing
=> "<p id='wizard_status'>Wizard update in progress . . . </p>"};
338 print STDERR
"materialized fullview last updated $timestamp\n";
339 return { timestamp
=> "<p id='wizard_status'>Wizard last updated: $timestamp</p>" };
343 sub get_phenotype_info
{
345 my $accession_sql = shift;
346 my $trial_sql = shift;
347 my $trait_sql = shift;
349 print STDERR
"GET_PHENOTYPE_INFO: $accession_sql - $trial_sql - $trait_sql \n\n";
351 my $rep_type_id = $self->get_stockprop_type_id("replicate");
352 my $block_number_type_id = $self -> get_stockprop_type_id
("block");
353 my $year_type_id = $self->get_projectprop_type_id("project year");
354 my $plot_type_id = $self->get_stock_type_id("plot");
355 my $plant_type_id = $self->get_stock_type_id("plant");
356 my $accession_type_id = $self->get_stock_type_id("accession");
358 my @where_clause = ();
359 if ($accession_sql) { push @where_clause, "stock.stock_id in ($accession_sql)"; }
360 if ($trial_sql) { push @where_clause, "project.project_id in ($trial_sql)"; }
361 if ($trait_sql) { push @where_clause, "cvterm.cvterm_id in ($trait_sql)"; }
363 my $where_clause = "";
365 if (@where_clause>0) {
366 $where_clause .= $rep_type_id ?
"WHERE (stockprop.type_id = $rep_type_id OR stockprop.type_id IS NULL) " : "WHERE stockprop.type_id IS NULL";
367 $where_clause .= "AND (plot.type_id = $plot_type_id OR plot.type_id = $plant_type_id) AND stock.type_id = $accession_type_id";
368 $where_clause .= $block_number_type_id ?
" AND (block_number.type_id = $block_number_type_id OR block_number.type_id IS NULL)" : " AND block_number.type_id IS NULL";
369 $where_clause .= $year_type_id ?
" AND projectprop.type_id = $year_type_id" :"" ;
370 $where_clause .= " AND " . (join (" AND " , @where_clause));
372 #$where_clause = "where (stockprop.type_id=$rep_type_id or stockprop.type_id IS NULL) AND (block_number.type_id=$block_number_type_id or block_number.type_id IS NULL) AND ".(join (" and ", @where_clause));
375 my $order_clause = " order by project.name, string_to_array(plot_number.value, '.')::int[]";
376 my $q = "SELECT projectprop.value, project.name, stock.uniquename, nd_geolocation.description, cvterm.name, phenotype.value, plot.uniquename, db.name, db.name || ':' || dbxref.accession AS accession, stockprop.value, block_number.value, cvterm.cvterm_id, project.project_id, nd_geolocation.nd_geolocation_id, stock.stock_id, plot.stock_id, phenotype.uniquename
377 FROM stock as plot JOIN stock_relationship ON (plot.stock_id=subject_id)
378 JOIN stock ON (object_id=stock.stock_id)
379 LEFT JOIN stockprop ON (plot.stock_id=stockprop.stock_id)
380 LEFT JOIN stockprop AS block_number ON (plot.stock_id=block_number.stock_id)
381 LEFT JOIN stockprop AS plot_number ON (plot.stock_id=plot_number.stock_id) AND plot_number.type_id = (SELECT cvterm_id from cvterm where cvterm.name = 'plot number')
382 JOIN nd_experiment_stock ON(nd_experiment_stock.stock_id=plot.stock_id)
383 JOIN nd_experiment ON (nd_experiment_stock.nd_experiment_id=nd_experiment.nd_experiment_id)
384 JOIN nd_geolocation USING(nd_geolocation_id)
385 JOIN nd_experiment_phenotype ON (nd_experiment_phenotype.nd_experiment_id=nd_experiment.nd_experiment_id)
386 JOIN phenotype USING(phenotype_id) JOIN cvterm ON (phenotype.cvalue_id=cvterm.cvterm_id)
388 JOIN dbxref ON (cvterm.dbxref_id = dbxref.dbxref_id)
390 JOIN nd_experiment_project ON (nd_experiment_project.nd_experiment_id=nd_experiment.nd_experiment_id)
391 JOIN project USING(project_id)
392 JOIN projectprop USING(project_id)
396 #print STDERR "QUERY: $q\n\n";
397 my $h = $self->dbh()->prepare($q);
401 while (my ($year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename) = $h->fetchrow_array()) {
402 push @
$result, [ $year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename ];
405 #print STDERR Dumper $result;
406 print STDERR
"QUERY returned ".scalar(@
$result)." rows.\n";
410 sub get_phenotype_info_matrix
{
412 my $accession_sql = shift;
413 my $trial_sql = shift;
414 my $trait_sql = shift;
416 my $data = $self->get_phenotype_info($accession_sql, $trial_sql, $trait_sql);
417 #data contains [$year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id]
422 foreach my $d (@
$data) {
423 print STDERR
"PRINTING TRAIT DATA FOR TERM " . $d->[4] . "\n\n";
424 my $cvterm = $d->[4]."|".$d->[8];
425 my $trait_data = $d->[5];
427 $plot_data{$plot}->{$cvterm} = $trait_data;
434 # generate header line
436 my @sorted_traits = sort keys(%traits);
437 foreach my $trait (@sorted_traits) {
438 $line .= "\t".$trait; # first header has to be empty (plot name column)
442 # dump phenotypic values
445 foreach my $plot (sort keys (%plot_data)) {
448 foreach my $trait (@sorted_traits) {
449 my $tab = $plot_data{$plot}->{$trait}; # ? "\t".$plot_data{$plot}->{$trait} : "\t";
450 $line .= defined($tab) ?
"\t".$tab : "\t";
459 sub get_extended_phenotype_info_matrix
{
461 my $accession_sql = shift;
462 my $trial_sql = shift;
463 my $trait_sql = shift;
464 my $include_timestamp = shift // 0;
466 my $data = $self->get_phenotype_info($accession_sql, $trial_sql, $trait_sql);
467 #data contains [$year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename]
472 print STDERR
"No of lines retrieved: ".scalar(@
$data)."\n";
473 foreach my $d (@
$data) {
475 my ($year, $project_name, $stock_name, $location, $trait, $trait_data, $plot, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename) = @
$d;
477 my $cvterm = $d->[4]."|".$d->[8];
478 if ($include_timestamp) {
479 my ($p1, $p2) = split /date: /, $phenotype_uniquename;
480 my ($timestamp, $p3) = split / operator/, $p2;
481 if( $timestamp =~ m/(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})(\S)(\d{4})/) {
482 $plot_data{$plot}->{$cvterm} = "$trait_data,$timestamp";
484 $plot_data{$plot}->{$cvterm} = $trait_data;
487 $plot_data{$plot}->{$cvterm} = $trait_data;
490 if (!defined($rep)) { $rep = ""; }
491 $plot_data{$plot}->{metadata
} = {
493 studyName
=> $project_name,
494 germplasmName
=> $stock_name,
495 locationName
=> $location,
496 blockNumber
=> $block_number,
499 trait_data
=> $trait_data,
501 cvterm_id
=> $trait_id,
502 studyDbId
=> $project_id,
503 locationDbId
=> $location_id,
504 germplasmDbId
=> $stock_id,
509 #print STDERR Dumper \%plot_data;
512 my $line = join "\t", qw
| studyYear studyDbId studyName locationDbId locationName germplasmDbId germplasmName plotDbId plotName rep blockNumber
|;
514 # generate header line
516 my @sorted_traits = sort keys(%traits);
517 foreach my $trait (@sorted_traits) {
518 $line .= "\t".$trait;
522 # dump phenotypic values
526 my @unique_plot_list = ();
527 foreach my $d (keys \
%plot_data) {
528 push @unique_plot_list, $d;
530 #print STDERR Dumper \@unique_plot_list;
532 foreach my $p (@unique_plot_list) {
533 #$line = join "\t", map { $plot_data{$p}->{metadata}->{$_} } ( "year", "trial_name", "location", "accession", "plot", "rep", "block_number" );
534 $line = join "\t", map { $plot_data{$p}->{metadata
}->{$_} } ( "year", "studyDbId", "studyName", "locationDbId", "locationName", "germplasmDbId", "germplasmName", "plotDbId", "plotName", "rep", "blockNumber" );
536 #print STDERR "Adding line for plot $p\n";
537 foreach my $trait (@sorted_traits) {
538 my $tab = $plot_data{$p}->{$trait};
539 $line .= defined($tab) ?
"\t".$tab : "\t";
549 =head2 get_genotype_info
551 parameters: comma-separated lists of accession, trial, and trait IDs. May be empty.
553 returns: an array with genotype information
557 sub get_genotype_info
{
560 my $accession_idref = shift;
561 my $protocol_id = shift;
562 my $snp_genotype_id = shift || '76434';
563 my @accession_ids = @
$accession_idref;
564 my ($q, @result, $protocol_name);
566 if (@accession_ids) {
567 $q = "SELECT name, uniquename, value FROM (SELECT nd_protocol.name, stock.uniquename, genotypeprop.value, row_number() over (partition by stock.uniquename order by genotypeprop.genotype_id) as rownum from genotypeprop join nd_experiment_genotype USING (genotype_id) JOIN nd_experiment_protocol USING(nd_experiment_id) JOIN nd_protocol USING(nd_protocol_id) JOIN nd_experiment_stock USING(nd_experiment_id) JOIN stock USING(stock_id) WHERE genotypeprop.type_id = ? AND stock.stock_id in (@{[join',', ('?') x @accession_ids]}) AND nd_experiment_protocol.nd_protocol_id=?) tmp WHERE rownum <2";
569 print STDERR
"QUERY: $q\n\n";
571 my $h = $self->dbh()->prepare($q);
572 $h->execute($snp_genotype_id, @accession_ids,$protocol_id);
575 while (my($name,$uniquename,$genotype_string) = $h->fetchrow_array()) {
576 push @result, [ $uniquename, $genotype_string ];
577 $protocol_name = $name;
581 protocol_name
=> $protocol_name,
582 genotypes
=> \
@result
590 my $q = "SELECT projectprop.type_id FROM projectprop JOIN cvterm on (projectprop.type_id=cvterm.cvterm_id) WHERE cvterm.name='$term'";
591 my $h = $self->dbh->prepare($q);
593 my ($type_id) = $h->fetchrow_array();
598 sub get_stock_type_id
{
601 my $q = "SELECT stock.type_id FROM stock JOIN cvterm on (stock.type_id=cvterm.cvterm_id) WHERE cvterm.name='$term'";
602 my $h = $self->dbh->prepare($q);
604 my ($type_id) = $h->fetchrow_array();
608 sub get_stockprop_type_id
{
611 my $q = "SELECT stockprop.type_id FROM stockprop JOIN cvterm on (stockprop.type_id=cvterm.cvterm_id) WHERE cvterm.name=?";
612 my $h = $self->dbh->prepare($q);
614 my ($type_id) = $h->fetchrow_array();
618 sub get_projectprop_type_id
{
621 my $q = "SELECT projectprop.type_id FROM projectprop JOIN cvterm ON (projectprop.type_id=cvterm.cvterm_id) WHERE cvterm.name=?";
622 my $h = $self->dbh->prepare($q);
624 my ($type_id) = $h->fetchrow_array();