3 CXGN::BreederSearch - class for retrieving breeder information for the breeder search wizard
7 Lukas Mueller <lam87@cornell.edu>
8 Aimin Yan <ay247@cornell.edu>
14 package CXGN
::BreederSearch
;
31 Usage: my %info = $bs->metadata_query($criteria_list, $dataref, $queryref);
33 Ret: returns a hash with a key called results that contains
34 a listref of listrefs specifying the matching list with ids
36 Args: criteria_list: a comma separated string called a criteria_list,
37 listing all the criteria that need to be applied. Possible
38 criteria are trials, years, traits, and locations. The last
39 criteria in the list is the return type.
40 dataref: The dataref is a hashref of hashrefs. The first key
41 is the target of the transformation, and the second is the
42 source type of the transformation, containing comma separated
43 values of the source type.
44 queryref: same structure as dataref, but instead of storing ids it stores a
45 1 if user requested intersect, or 0 for default union
46 Side Effects: will run refresh_matviews() if matviews aren't already populated
54 my $criteria_list = shift;
58 print STDERR
"criteria_list=" . Dumper
($criteria_list);
59 print STDERR
"dataref=" . Dumper
($dataref);
60 print STDERR
"queryref=" . Dumper
($queryref);
62 # Check if matviews are populated, and run refresh if they aren't. Which, as of postgres 9.5, will be the case when our databases are loaded from a dump. This should no longer be necessary once this bug is fixed in newer postgres versions
63 my ($status, %response_hash);
65 my $populated_query = "select * from materialized_phenoview limit 1";
66 my $sth = $self->dbh->prepare($populated_query);
68 } catch
{ #if test query fails because views aren't populated
69 print STDERR
"Using basic refresh to populate views . . .\n";
70 $status = $self->refresh_matviews($c->config->{dbhost
}, $c->config->{dbname
}, $c->config->{dbuser
}, $c->config->{dbpass
}, 'basic');
71 %response_hash = %$status;
74 if (%response_hash && $response_hash{'message'} eq 'Wizard update completed!') {
75 print STDERR
"Populated views, now proceeding with query . . . .\n";
76 } elsif (%response_hash && $response_hash{'message'} eq 'Wizard update initiated.') {
77 return { error
=> "The search wizard is temporarily unavailable while database indexes are being repopulated. Please try again later. Depending on the size of the database, it will be ready within a few seconds to an hour."};
78 } elsif (%response_hash && $response_hash{'error'}) {
79 return { error
=> $response_hash{'error'} };
82 my $target_table = $criteria_list->[-1];
83 print STDERR
"target_table=". $target_table . "\n";
84 my $target = $target_table;
87 my $select = "SELECT ".$target."_id, ".$target."_name ";
88 my $group = "GROUP BY ".$target."_id, ".$target."_name ";
91 if (!$dataref->{"$target_table"}) {
92 my $from = "FROM public.". $target_table;
93 my $where = " WHERE ".$target."_id IS NOT NULL";
94 $full_query = $select . $from . $where;
98 foreach my $category (@
$criteria_list) {
100 if ($dataref->{$criteria_list->[-1]}->{$category}) {
102 my @categories = ($target_table, $category);
103 @categories = sort @categories;
104 my $from = "FROM public.". $categories[0] ."x". $categories[1] . " JOIN public." . $target_table . " USING(" . $target."_id) ";
105 my $criterion = $category;
106 $criterion =~ s/s$//;
107 my $intersect = $queryref->{$criteria_list->[-1]}->{$category};
111 my @ids = split(/,/, $dataref->{$criteria_list->[-1]}->{$category});
112 foreach my $id (@ids) {
113 my $where = "WHERE ". $criterion. "_id IN (". $id .") ";
114 my $statement = $select . $from . $where . $group;
115 push @parts, $statement;
117 $query = join (" INTERSECT ", @parts);
118 push @queries, $query;
121 my $where = "WHERE ". $criterion. "_id IN (" . $dataref->{$criteria_list->[-1]}->{$category} . ") ";
122 $query = $select . $from . $where . $group;
123 push @queries, $query;
127 $full_query = join (" INTERSECT ", @queries);
129 $full_query .= " ORDER BY 2";
130 print STDERR
"QUERY: $full_query\n";
131 $h = $self->dbh->prepare($full_query);
135 while (my ($id, $name) = $h->fetchrow_array()) {
136 push @results, [ $id, $name ];
139 if (@results >= 10_000
) {
140 return { error
=> scalar(@results).' matches. Too many results to display' };
142 elsif (@results < 1) {
143 return { error
=> scalar(@results).' matches. No results to display' };
146 return { results
=> \
@results };
150 =head2 refresh_matviews
152 parameters: string to specify desired refresh type, basic or concurrent. defaults to concurrent
154 returns: message detailing success or error
156 Side Effects: Refreshes materialized views
160 sub refresh_matviews
{
167 my $refresh_type = shift || 'concurrent';
168 my $refresh_finished = 0;
171 my $q = "SELECT currently_refreshing FROM public.matviews WHERE mv_id=?";
172 my $h = $self->dbh->prepare($q);
175 my $refreshing = $h->fetchrow_array();
178 return { error
=> 'Wizard update already in progress . . . ' };
182 my $dbh = $self->dbh();
183 if ($refresh_type eq 'concurrent') {
184 #print STDERR "Using CXGN::Tools::Run to run perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass -c";
185 $async_refresh = CXGN
::Tools
::Run
->run_async("perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass -c");
187 print STDERR
"Using CXGN::Tools::Run to run perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass";
188 $async_refresh = CXGN
::Tools
::Run
->run_async("perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass");
191 for (my $i = 1; $i < 10; $i++) {
193 if ($async_refresh->alive) {
196 $refresh_finished = 1;
200 if ($refresh_finished) {
201 return { message
=> 'Wizard update completed!' };
203 return { message
=> 'Wizard update initiated.' };
206 print STDERR
'Error initiating wizard update.' . $@
. "\n";
207 return { error
=> 'Error initiating wizard update.' . $@
};
212 =head2 matviews_status
214 Desc: checks tracking table to see if materialized views are updating, and if not, when they were last updated.
218 returns: refreshing message or timestamp
224 sub matviews_status
{
226 my $q = "SELECT currently_refreshing, last_refresh FROM public.matviews WHERE mv_id=?";
227 my $h = $self->dbh->prepare($q);
230 my ($refreshing, $timestamp) = $h->fetchrow_array();
233 print STDERR
"Wizard is already refreshing, current status: $refreshing \n";
234 return { refreshing
=> "<p id='wizard_status'>Wizard update in progress . . . </p>"};
237 print STDERR
"materialized fullview last updated $timestamp\n";
238 return { timestamp
=> "<p id='wizard_status'>Wizard last updated: $timestamp</p>" };
242 sub get_phenotype_info
{
244 my $accession_sql = shift;
245 my $trial_sql = shift;
246 my $trait_sql = shift;
248 print STDERR
"GET_PHENOTYPE_INFO: $accession_sql - $trial_sql - $trait_sql \n\n";
250 my $rep_type_id = $self->get_stockprop_type_id("replicate");
251 my $block_number_type_id = $self -> get_stockprop_type_id
("block");
252 my $year_type_id = $self->get_projectprop_type_id("project year");
253 my $plot_type_id = $self->get_stock_type_id("plot");
254 my $plant_type_id = $self->get_stock_type_id("plant");
255 my $accession_type_id = $self->get_stock_type_id("accession");
257 my @where_clause = ();
258 if ($accession_sql) { push @where_clause, "stock.stock_id in ($accession_sql)"; }
259 if ($trial_sql) { push @where_clause, "project.project_id in ($trial_sql)"; }
260 if ($trait_sql) { push @where_clause, "cvterm.cvterm_id in ($trait_sql)"; }
262 my $where_clause = "";
264 if (@where_clause>0) {
265 $where_clause .= $rep_type_id ?
"WHERE (stockprop.type_id = $rep_type_id OR stockprop.type_id IS NULL) " : "WHERE stockprop.type_id IS NULL";
266 $where_clause .= "AND (plot.type_id = $plot_type_id OR plot.type_id = $plant_type_id) AND stock.type_id = $accession_type_id";
267 $where_clause .= $block_number_type_id ?
" AND (block_number.type_id = $block_number_type_id OR block_number.type_id IS NULL)" : " AND block_number.type_id IS NULL";
268 $where_clause .= $year_type_id ?
" AND projectprop.type_id = $year_type_id" :"" ;
269 $where_clause .= " AND " . (join (" AND " , @where_clause));
271 #$where_clause = "where (stockprop.type_id=$rep_type_id or stockprop.type_id IS NULL) AND (block_number.type_id=$block_number_type_id or block_number.type_id IS NULL) AND ".(join (" and ", @where_clause));
274 my $order_clause = " order by project.name, string_to_array(plot_number.value, '.')::int[]";
275 my $q = "SELECT projectprop.value, project.name, stock.uniquename, nd_geolocation.description, cvterm.name, phenotype.value, plot.uniquename, db.name, db.name || ':' || dbxref.accession AS accession, stockprop.value, block_number.value, cvterm.cvterm_id, project.project_id, nd_geolocation.nd_geolocation_id, stock.stock_id, plot.stock_id, phenotype.uniquename
276 FROM stock as plot JOIN stock_relationship ON (plot.stock_id=subject_id)
277 JOIN stock ON (object_id=stock.stock_id)
278 LEFT JOIN stockprop ON (plot.stock_id=stockprop.stock_id)
279 LEFT JOIN stockprop AS block_number ON (plot.stock_id=block_number.stock_id)
280 LEFT JOIN stockprop AS plot_number ON (plot.stock_id=plot_number.stock_id) AND plot_number.type_id = (SELECT cvterm_id from cvterm where cvterm.name = 'plot number')
281 JOIN nd_experiment_stock ON(nd_experiment_stock.stock_id=plot.stock_id)
282 JOIN nd_experiment ON (nd_experiment_stock.nd_experiment_id=nd_experiment.nd_experiment_id)
283 JOIN nd_geolocation USING(nd_geolocation_id)
284 JOIN nd_experiment_phenotype ON (nd_experiment_phenotype.nd_experiment_id=nd_experiment.nd_experiment_id)
285 JOIN phenotype USING(phenotype_id) JOIN cvterm ON (phenotype.cvalue_id=cvterm.cvterm_id)
287 JOIN dbxref ON (cvterm.dbxref_id = dbxref.dbxref_id)
289 JOIN nd_experiment_project ON (nd_experiment_project.nd_experiment_id=nd_experiment.nd_experiment_id)
290 JOIN project USING(project_id)
291 JOIN projectprop USING(project_id)
295 #print STDERR "QUERY: $q\n\n";
296 my $h = $self->dbh()->prepare($q);
300 while (my ($year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename) = $h->fetchrow_array()) {
301 push @
$result, [ $year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename ];
304 #print STDERR Dumper $result;
305 print STDERR
"QUERY returned ".scalar(@
$result)." rows.\n";
309 sub get_phenotype_info_matrix
{
311 my $accession_sql = shift;
312 my $trial_sql = shift;
313 my $trait_sql = shift;
315 my $data = $self->get_phenotype_info($accession_sql, $trial_sql, $trait_sql);
316 #data contains [$year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id]
321 foreach my $d (@
$data) {
322 print STDERR
"PRINTING TRAIT DATA FOR TERM " . $d->[4] . "\n\n";
323 my $cvterm = $d->[4]."|".$d->[8];
324 my $trait_data = $d->[5];
326 $plot_data{$plot}->{$cvterm} = $trait_data;
333 # generate header line
335 my @sorted_traits = sort keys(%traits);
336 foreach my $trait (@sorted_traits) {
337 $line .= "\t".$trait; # first header has to be empty (plot name column)
341 # dump phenotypic values
344 foreach my $plot (sort keys (%plot_data)) {
347 foreach my $trait (@sorted_traits) {
348 my $tab = $plot_data{$plot}->{$trait}; # ? "\t".$plot_data{$plot}->{$trait} : "\t";
349 $line .= defined($tab) ?
"\t".$tab : "\t";
358 sub get_extended_phenotype_info_matrix
{
360 my $accession_sql = shift;
361 my $trial_sql = shift;
362 my $trait_sql = shift;
363 my $include_timestamp = shift // 0;
365 my $data = $self->get_phenotype_info($accession_sql, $trial_sql, $trait_sql);
366 #data contains [$year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename]
371 print STDERR
"No of lines retrieved: ".scalar(@
$data)."\n";
372 foreach my $d (@
$data) {
374 my ($year, $project_name, $stock_name, $location, $trait, $trait_data, $plot, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename) = @
$d;
376 my $cvterm = $d->[4]."|".$d->[8];
377 if ($include_timestamp) {
378 my ($p1, $p2) = split /date: /, $phenotype_uniquename;
379 my ($timestamp, $p3) = split / operator/, $p2;
380 if( $timestamp =~ m/(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})(\S)(\d{4})/) {
381 $plot_data{$plot}->{$cvterm} = "$trait_data,$timestamp";
383 $plot_data{$plot}->{$cvterm} = $trait_data;
386 $plot_data{$plot}->{$cvterm} = $trait_data;
389 if (!defined($rep)) { $rep = ""; }
390 $plot_data{$plot}->{metadata
} = {
392 studyName
=> $project_name,
393 germplasmName
=> $stock_name,
394 locationName
=> $location,
395 blockNumber
=> $block_number,
398 trait_data
=> $trait_data,
400 cvterm_id
=> $trait_id,
401 studyDbId
=> $project_id,
402 locationDbId
=> $location_id,
403 germplasmDbId
=> $stock_id,
408 #print STDERR Dumper \%plot_data;
411 my $line = join "\t", qw
| studyYear studyDbId studyName locationDbId locationName germplasmDbId germplasmName plotDbId plotName rep blockNumber
|;
413 # generate header line
415 my @sorted_traits = sort keys(%traits);
416 foreach my $trait (@sorted_traits) {
417 $line .= "\t".$trait;
421 # dump phenotypic values
425 my @unique_plot_list = ();
426 foreach my $d (keys \
%plot_data) {
427 push @unique_plot_list, $d;
429 #print STDERR Dumper \@unique_plot_list;
431 foreach my $p (@unique_plot_list) {
432 #$line = join "\t", map { $plot_data{$p}->{metadata}->{$_} } ( "year", "trial_name", "location", "accession", "plot", "rep", "block_number" );
433 $line = join "\t", map { $plot_data{$p}->{metadata
}->{$_} } ( "year", "studyDbId", "studyName", "locationDbId", "locationName", "germplasmDbId", "germplasmName", "plotDbId", "plotName", "rep", "blockNumber" );
435 #print STDERR "Adding line for plot $p\n";
436 foreach my $trait (@sorted_traits) {
437 my $tab = $plot_data{$p}->{$trait};
438 $line .= defined($tab) ?
"\t".$tab : "\t";
448 =head2 get_genotype_info
450 parameters: comma-separated lists of accession, trial, and trait IDs. May be empty.
452 returns: an array with genotype information
456 sub get_genotype_info
{
459 my $accession_idref = shift;
460 my $protocol_id = shift;
461 my $snp_genotype_id = shift || '76434';
462 my @accession_ids = @
$accession_idref;
463 my ($q, @result, $protocol_name);
465 if (@accession_ids) {
466 $q = "SELECT name, uniquename, value FROM (SELECT nd_protocol.name, stock.uniquename, genotypeprop.value, row_number() over (partition by stock.uniquename order by genotypeprop.genotype_id) as rownum from genotypeprop join nd_experiment_genotype USING (genotype_id) JOIN nd_experiment_protocol USING(nd_experiment_id) JOIN nd_protocol USING(nd_protocol_id) JOIN nd_experiment_stock USING(nd_experiment_id) JOIN stock USING(stock_id) WHERE genotypeprop.type_id = ? AND stock.stock_id in (@{[join',', ('?') x @accession_ids]}) AND nd_experiment_protocol.nd_protocol_id=?) tmp WHERE rownum <2";
468 print STDERR
"QUERY: $q\n\n";
470 my $h = $self->dbh()->prepare($q);
471 $h->execute($snp_genotype_id, @accession_ids,$protocol_id);
474 while (my($name,$uniquename,$genotype_string) = $h->fetchrow_array()) {
475 push @result, [ $uniquename, $genotype_string ];
476 $protocol_name = $name;
480 protocol_name
=> $protocol_name,
481 genotypes
=> \
@result
489 my $q = "SELECT projectprop.type_id FROM projectprop JOIN cvterm on (projectprop.type_id=cvterm.cvterm_id) WHERE cvterm.name='$term'";
490 my $h = $self->dbh->prepare($q);
492 my ($type_id) = $h->fetchrow_array();
497 sub get_stock_type_id
{
500 my $q = "SELECT stock.type_id FROM stock JOIN cvterm on (stock.type_id=cvterm.cvterm_id) WHERE cvterm.name='$term'";
501 my $h = $self->dbh->prepare($q);
503 my ($type_id) = $h->fetchrow_array();
507 sub get_stockprop_type_id
{
510 my $q = "SELECT stockprop.type_id FROM stockprop JOIN cvterm on (stockprop.type_id=cvterm.cvterm_id) WHERE cvterm.name=?";
511 my $h = $self->dbh->prepare($q);
513 my ($type_id) = $h->fetchrow_array();
517 sub get_projectprop_type_id
{
520 my $q = "SELECT projectprop.type_id FROM projectprop JOIN cvterm ON (projectprop.type_id=cvterm.cvterm_id) WHERE cvterm.name=?";
521 my $h = $self->dbh->prepare($q);
523 my ($type_id) = $h->fetchrow_array();