can download plant phenotype data in the same way as plot phenotype data
[sgn.git] / lib / CXGN / BreederSearch.pm
blobe4c8b2f7469b7693a446112a0934fb5db761852e
1 =head1 NAME
3 CXGN::BreederSearch - class for retrieving breeder information for the breeder search wizard
5 =head1 AUTHORS
7 Lukas Mueller <lam87@cornell.edu>
8 Aimin Yan <ay247@cornell.edu>
10 =head1 METHODS
12 =cut
14 package CXGN::BreederSearch;
16 use Moose;
17 use Data::Dumper;
18 use Try::Tiny;
20 has 'dbh' => (
21 is => 'rw',
22 required => 1,
24 has 'dbname' => (
25 is => 'rw',
26 isa => 'Str',
29 =head2 metadata_query
31 Usage: my %info = $bs->metadata_query($criteria_list, $dataref, $queryref);
32 Desc:
33 Ret: returns a hash with a key called results that contains
34 a listref of listrefs specifying the matching list with ids
35 and names.
36 Args: criteria_list: a comma separated string called a criteria_list,
37 listing all the criteria that need to be applied. Possible
38 criteria are trials, years, traits, and locations. The last
39 criteria in the list is the return type.
40 dataref: The dataref is a hashref of hashrefs. The first key
41 is the target of the transformation, and the second is the
42 source type of the transformation, containing comma separated
43 values of the source type.
44 queryref: same structure as dataref, but instead of storing ids it stores a
45 1 if user requested intersect, or 0 for default union
46 Side Effects: will run refresh_matviews() if matviews aren't already populated
47 Example:
49 =cut
51 sub metadata_query {
52 my $self = shift;
53 my $c = shift;
54 my $criteria_list = shift;
55 my $dataref = shift;
56 my $queryref = shift;
57 my $h;
58 print STDERR "criteria_list=" . Dumper($criteria_list);
59 print STDERR "dataref=" . Dumper($dataref);
60 print STDERR "queryref=" . Dumper($queryref);
62 # Check if matviews are populated, and run refresh if they aren't. Which, as of postgres 9.5, will be the case when our databases are loaded from a dump. This should no longer be necessary once this bug is fixed in newer postgres versions
63 my ($status, %response_hash);
64 try {
65 my $populated_query = "select * from materialized_phenoview limit 1";
66 my $sth = $self->dbh->prepare($populated_query);
67 $sth->execute();
68 } catch { #if test query fails because views aren't populated
69 print STDERR "Using basic refresh to populate views . . .\n";
70 $status = $self->refresh_matviews($c->config->{dbhost}, $c->config->{dbname}, $c->config->{dbuser}, $c->config->{dbpass}, 'basic');
71 %response_hash = %$status;
74 if (%response_hash && $response_hash{'message'} eq 'Wizard update completed!') {
75 print STDERR "Populated views, now proceeding with query . . . .\n";
76 } elsif (%response_hash && $response_hash{'message'} eq 'Wizard update initiated.') {
77 return { error => "The search wizard is temporarily unavailable while database indexes are being repopulated. Please try again later. Depending on the size of the database, it will be ready within a few seconds to an hour."};
78 } elsif (%response_hash && $response_hash{'error'}) {
79 return { error => $response_hash{'error'} };
82 my $target_table = $criteria_list->[-1];
83 print STDERR "target_table=". $target_table . "\n";
84 my $target = $target_table;
85 $target =~ s/s$//;
87 my $select = "SELECT ".$target."_id, ".$target."_name ";
88 my $group = "GROUP BY ".$target."_id, ".$target."_name ";
90 my $full_query;
91 if (!$dataref->{"$target_table"}) {
92 my $from = "FROM public.". $target_table;
93 my $where = " WHERE ".$target."_id IS NOT NULL";
94 $full_query = $select . $from . $where;
96 else {
97 my @queries;
98 foreach my $category (@$criteria_list) {
100 if ($dataref->{$criteria_list->[-1]}->{$category}) {
101 my $query;
102 my @categories = ($target_table, $category);
103 @categories = sort @categories;
104 my $from = "FROM public.". $categories[0] ."x". $categories[1] . " JOIN public." . $target_table . " USING(" . $target."_id) ";
105 my $criterion = $category;
106 $criterion =~ s/s$//;
107 my $intersect = $queryref->{$criteria_list->[-1]}->{$category};
109 if ($intersect) {
110 my @parts;
111 my @ids = split(/,/, $dataref->{$criteria_list->[-1]}->{$category});
112 foreach my $id (@ids) {
113 my $where = "WHERE ". $criterion. "_id IN (". $id .") ";
114 my $statement = $select . $from . $where . $group;
115 push @parts, $statement;
117 $query = join (" INTERSECT ", @parts);
118 push @queries, $query;
120 else {
121 my $where = "WHERE ". $criterion. "_id IN (" . $dataref->{$criteria_list->[-1]}->{$category} . ") ";
122 $query = $select . $from . $where . $group;
123 push @queries, $query;
127 $full_query = join (" INTERSECT ", @queries);
129 $full_query .= " ORDER BY 2";
130 print STDERR "QUERY: $full_query\n";
131 $h = $self->dbh->prepare($full_query);
132 $h->execute();
134 my @results;
135 while (my ($id, $name) = $h->fetchrow_array()) {
136 push @results, [ $id, $name ];
139 if (@results >= 10_000) {
140 return { error => scalar(@results).' matches. Too many results to display' };
142 elsif (@results < 1) {
143 return { error => scalar(@results).' matches. No results to display' };
145 else {
146 return { results => \@results };
150 =head2 refresh_matviews
152 parameters: string to specify desired refresh type, basic or concurrent. defaults to concurrent
154 returns: message detailing success or error
156 Side Effects: Refreshes materialized views
158 =cut
160 sub refresh_matviews {
162 my $self = shift;
163 my $dbhost = shift;
164 my $dbname = shift;
165 my $dbuser = shift;
166 my $dbpass = shift;
167 my $refresh_type = shift || 'concurrent';
168 my $refresh_finished = 0;
169 my $async_refresh;
171 my $q = "SELECT currently_refreshing FROM public.matviews WHERE mv_id=?";
172 my $h = $self->dbh->prepare($q);
173 $h->execute(1);
175 my $refreshing = $h->fetchrow_array();
177 if ($refreshing) {
178 return { error => 'Wizard update already in progress . . . ' };
180 else {
181 try {
182 my $dbh = $self->dbh();
183 if ($refresh_type eq 'concurrent') {
184 #print STDERR "Using CXGN::Tools::Run to run perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass -c";
185 $async_refresh = CXGN::Tools::Run->run_async("perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass -c");
186 } else {
187 print STDERR "Using CXGN::Tools::Run to run perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass";
188 $async_refresh = CXGN::Tools::Run->run_async("perl bin/refresh_matviews.pl -H $dbhost -D $dbname -U $dbuser -P $dbpass");
191 for (my $i = 1; $i < 10; $i++) {
192 sleep($i/5);
193 if ($async_refresh->alive) {
194 next;
195 } else {
196 $refresh_finished = 1;
200 if ($refresh_finished) {
201 return { message => 'Wizard update completed!' };
202 } else {
203 return { message => 'Wizard update initiated.' };
205 } catch {
206 print STDERR 'Error initiating wizard update.' . $@ . "\n";
207 return { error => 'Error initiating wizard update.' . $@ };
212 =head2 matviews_status
214 Desc: checks tracking table to see if materialized views are updating, and if not, when they were last updated.
216 parameters: None.
218 returns: refreshing message or timestamp
220 Side Effects: none
222 =cut
224 sub matviews_status {
225 my $self = shift;
226 my $q = "SELECT currently_refreshing, last_refresh FROM public.matviews WHERE mv_id=?";
227 my $h = $self->dbh->prepare($q);
228 $h->execute(1);
230 my ($refreshing, $timestamp) = $h->fetchrow_array();
232 if ($refreshing) {
233 print STDERR "Wizard is already refreshing, current status: $refreshing \n";
234 return { refreshing => "<p id='wizard_status'>Wizard update in progress . . . </p>"};
236 else {
237 print STDERR "materialized fullview last updated $timestamp\n";
238 return { timestamp => "<p id='wizard_status'>Wizard last updated: $timestamp</p>" };
242 sub get_phenotype_info {
243 my $self = shift;
244 my $accession_sql = shift;
245 my $trial_sql = shift;
246 my $trait_sql = shift;
248 print STDERR "GET_PHENOTYPE_INFO: $accession_sql - $trial_sql - $trait_sql \n\n";
250 my $rep_type_id = $self->get_stockprop_type_id("replicate");
251 my $block_number_type_id = $self -> get_stockprop_type_id("block");
252 my $year_type_id = $self->get_projectprop_type_id("project year");
253 my $plot_type_id = $self->get_stock_type_id("plot");
254 my $plant_type_id = $self->get_stock_type_id("plant");
255 my $accession_type_id = $self->get_stock_type_id("accession");
257 my @where_clause = ();
258 if ($accession_sql) { push @where_clause, "stock.stock_id in ($accession_sql)"; }
259 if ($trial_sql) { push @where_clause, "project.project_id in ($trial_sql)"; }
260 if ($trait_sql) { push @where_clause, "cvterm.cvterm_id in ($trait_sql)"; }
262 my $where_clause = "";
264 if (@where_clause>0) {
265 $where_clause .= $rep_type_id ? "WHERE (stockprop.type_id = $rep_type_id OR stockprop.type_id IS NULL) " : "WHERE stockprop.type_id IS NULL";
266 $where_clause .= "AND (plot.type_id = $plot_type_id OR plot.type_id = $plant_type_id) AND stock.type_id = $accession_type_id";
267 $where_clause .= $block_number_type_id ? " AND (block_number.type_id = $block_number_type_id OR block_number.type_id IS NULL)" : " AND block_number.type_id IS NULL";
268 $where_clause .= $year_type_id ? " AND projectprop.type_id = $year_type_id" :"" ;
269 $where_clause .= " AND " . (join (" AND " , @where_clause));
271 #$where_clause = "where (stockprop.type_id=$rep_type_id or stockprop.type_id IS NULL) AND (block_number.type_id=$block_number_type_id or block_number.type_id IS NULL) AND ".(join (" and ", @where_clause));
274 my $order_clause = " order by project.name, string_to_array(plot_number.value, '.')::int[]";
275 my $q = "SELECT projectprop.value, project.name, stock.uniquename, nd_geolocation.description, cvterm.name, phenotype.value, plot.uniquename, db.name, db.name || ':' || dbxref.accession AS accession, stockprop.value, block_number.value, cvterm.cvterm_id, project.project_id, nd_geolocation.nd_geolocation_id, stock.stock_id, plot.stock_id, phenotype.uniquename
276 FROM stock as plot JOIN stock_relationship ON (plot.stock_id=subject_id)
277 JOIN stock ON (object_id=stock.stock_id)
278 LEFT JOIN stockprop ON (plot.stock_id=stockprop.stock_id)
279 LEFT JOIN stockprop AS block_number ON (plot.stock_id=block_number.stock_id)
280 LEFT JOIN stockprop AS plot_number ON (plot.stock_id=plot_number.stock_id) AND plot_number.type_id = (SELECT cvterm_id from cvterm where cvterm.name = 'plot number')
281 JOIN nd_experiment_stock ON(nd_experiment_stock.stock_id=plot.stock_id)
282 JOIN nd_experiment ON (nd_experiment_stock.nd_experiment_id=nd_experiment.nd_experiment_id)
283 JOIN nd_geolocation USING(nd_geolocation_id)
284 JOIN nd_experiment_phenotype ON (nd_experiment_phenotype.nd_experiment_id=nd_experiment.nd_experiment_id)
285 JOIN phenotype USING(phenotype_id) JOIN cvterm ON (phenotype.cvalue_id=cvterm.cvterm_id)
286 JOIN cv USING(cv_id)
287 JOIN dbxref ON (cvterm.dbxref_id = dbxref.dbxref_id)
288 JOIN db USING(db_id)
289 JOIN nd_experiment_project ON (nd_experiment_project.nd_experiment_id=nd_experiment.nd_experiment_id)
290 JOIN project USING(project_id)
291 JOIN projectprop USING(project_id)
292 $where_clause
293 $order_clause";
295 #print STDERR "QUERY: $q\n\n";
296 my $h = $self->dbh()->prepare($q);
297 $h->execute();
299 my $result = [];
300 while (my ($year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename) = $h->fetchrow_array()) {
301 push @$result, [ $year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename ];
304 #print STDERR Dumper $result;
305 print STDERR "QUERY returned ".scalar(@$result)." rows.\n";
306 return $result;
309 sub get_phenotype_info_matrix {
310 my $self = shift;
311 my $accession_sql = shift;
312 my $trial_sql = shift;
313 my $trait_sql = shift;
315 my $data = $self->get_phenotype_info($accession_sql, $trial_sql, $trait_sql);
316 #data contains [$year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id]
318 my %plot_data;
319 my %traits;
321 foreach my $d (@$data) {
322 print STDERR "PRINTING TRAIT DATA FOR TERM " . $d->[4] . "\n\n";
323 my $cvterm = $d->[4]."|".$d->[8];
324 my $trait_data = $d->[5];
325 my $plot = $d->[6];
326 $plot_data{$plot}->{$cvterm} = $trait_data;
327 $traits{$cvterm}++;
330 my @info = ();
331 my $line = "";
333 # generate header line
335 my @sorted_traits = sort keys(%traits);
336 foreach my $trait (@sorted_traits) {
337 $line .= "\t".$trait; # first header has to be empty (plot name column)
339 push @info, $line;
341 # dump phenotypic values
343 my $count2 = 0;
344 foreach my $plot (sort keys (%plot_data)) {
345 $line = $plot;
347 foreach my $trait (@sorted_traits) {
348 my $tab = $plot_data{$plot}->{$trait}; # ? "\t".$plot_data{$plot}->{$trait} : "\t";
349 $line .= defined($tab) ? "\t".$tab : "\t";
352 push @info, $line;
355 return @info;
358 sub get_extended_phenotype_info_matrix {
359 my $self = shift;
360 my $accession_sql = shift;
361 my $trial_sql = shift;
362 my $trait_sql = shift;
363 my $include_timestamp = shift // 0;
365 my $data = $self->get_phenotype_info($accession_sql, $trial_sql, $trait_sql);
366 #data contains [$year, $project_name, $stock_name, $location, $trait, $value, $plot_name, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename]
368 my %plot_data;
369 my %traits;
371 print STDERR "No of lines retrieved: ".scalar(@$data)."\n";
372 foreach my $d (@$data) {
374 my ($year, $project_name, $stock_name, $location, $trait, $trait_data, $plot, $cv_name, $cvterm_accession, $rep, $block_number, $trait_id, $project_id, $location_id, $stock_id, $plot_id, $phenotype_uniquename) = @$d;
376 my $cvterm = $d->[4]."|".$d->[8];
377 if ($include_timestamp) {
378 my ($p1, $p2) = split /date: /, $phenotype_uniquename;
379 my ($timestamp, $p3) = split / operator/, $p2;
380 if( $timestamp =~ m/(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})(\S)(\d{4})/) {
381 $plot_data{$plot}->{$cvterm} = "$trait_data,$timestamp";
382 } else {
383 $plot_data{$plot}->{$cvterm} = $trait_data;
385 } else {
386 $plot_data{$plot}->{$cvterm} = $trait_data;
389 if (!defined($rep)) { $rep = ""; }
390 $plot_data{$plot}->{metadata} = {
391 rep => $rep,
392 studyName => $project_name,
393 germplasmName => $stock_name,
394 locationName => $location,
395 blockNumber => $block_number,
396 plotName => $plot,
397 cvterm => $cvterm,
398 trait_data => $trait_data,
399 year => $year,
400 cvterm_id => $trait_id,
401 studyDbId => $project_id,
402 locationDbId => $location_id,
403 germplasmDbId => $stock_id,
404 plotDbId => $plot_id
406 $traits{$cvterm}++;
408 #print STDERR Dumper \%plot_data;
410 my @info = ();
411 my $line = join "\t", qw | studyYear studyDbId studyName locationDbId locationName germplasmDbId germplasmName plotDbId plotName rep blockNumber |;
413 # generate header line
415 my @sorted_traits = sort keys(%traits);
416 foreach my $trait (@sorted_traits) {
417 $line .= "\t".$trait;
419 push @info, $line;
421 # dump phenotypic values
423 my $count2 = 0;
425 my @unique_plot_list = ();
426 foreach my $d (keys \%plot_data) {
427 push @unique_plot_list, $d;
429 #print STDERR Dumper \@unique_plot_list;
431 foreach my $p (@unique_plot_list) {
432 #$line = join "\t", map { $plot_data{$p}->{metadata}->{$_} } ( "year", "trial_name", "location", "accession", "plot", "rep", "block_number" );
433 $line = join "\t", map { $plot_data{$p}->{metadata}->{$_} } ( "year", "studyDbId", "studyName", "locationDbId", "locationName", "germplasmDbId", "germplasmName", "plotDbId", "plotName", "rep", "blockNumber" );
435 #print STDERR "Adding line for plot $p\n";
436 foreach my $trait (@sorted_traits) {
437 my $tab = $plot_data{$p}->{$trait};
438 $line .= defined($tab) ? "\t".$tab : "\t";
440 push @info, $line;
443 return @info;
448 =head2 get_genotype_info
450 parameters: comma-separated lists of accession, trial, and trait IDs. May be empty.
452 returns: an array with genotype information
454 =cut
456 sub get_genotype_info {
458 my $self = shift;
459 my $accession_idref = shift;
460 my $protocol_id = shift;
461 my $snp_genotype_id = shift || '76434';
462 my @accession_ids = @$accession_idref;
463 my ($q, @result, $protocol_name);
465 if (@accession_ids) {
466 $q = "SELECT name, uniquename, value FROM (SELECT nd_protocol.name, stock.uniquename, genotypeprop.value, row_number() over (partition by stock.uniquename order by genotypeprop.genotype_id) as rownum from genotypeprop join nd_experiment_genotype USING (genotype_id) JOIN nd_experiment_protocol USING(nd_experiment_id) JOIN nd_protocol USING(nd_protocol_id) JOIN nd_experiment_stock USING(nd_experiment_id) JOIN stock USING(stock_id) WHERE genotypeprop.type_id = ? AND stock.stock_id in (@{[join',', ('?') x @accession_ids]}) AND nd_experiment_protocol.nd_protocol_id=?) tmp WHERE rownum <2";
468 print STDERR "QUERY: $q\n\n";
470 my $h = $self->dbh()->prepare($q);
471 $h->execute($snp_genotype_id, @accession_ids,$protocol_id);
474 while (my($name,$uniquename,$genotype_string) = $h->fetchrow_array()) {
475 push @result, [ $uniquename, $genotype_string ];
476 $protocol_name = $name;
479 return {
480 protocol_name => $protocol_name,
481 genotypes => \@result
486 sub get_type_id {
487 my $self = shift;
488 my $term = shift;
489 my $q = "SELECT projectprop.type_id FROM projectprop JOIN cvterm on (projectprop.type_id=cvterm.cvterm_id) WHERE cvterm.name='$term'";
490 my $h = $self->dbh->prepare($q);
491 $h->execute();
492 my ($type_id) = $h->fetchrow_array();
493 return $type_id;
497 sub get_stock_type_id {
498 my $self = shift;
499 my $term =shift;
500 my $q = "SELECT stock.type_id FROM stock JOIN cvterm on (stock.type_id=cvterm.cvterm_id) WHERE cvterm.name='$term'";
501 my $h = $self->dbh->prepare($q);
502 $h->execute();
503 my ($type_id) = $h->fetchrow_array();
504 return $type_id;
507 sub get_stockprop_type_id {
508 my $self = shift;
509 my $term = shift;
510 my $q = "SELECT stockprop.type_id FROM stockprop JOIN cvterm on (stockprop.type_id=cvterm.cvterm_id) WHERE cvterm.name=?";
511 my $h = $self->dbh->prepare($q);
512 $h->execute($term);
513 my ($type_id) = $h->fetchrow_array();
514 return $type_id;
517 sub get_projectprop_type_id {
518 my $self = shift;
519 my $term = shift;
520 my $q = "SELECT projectprop.type_id FROM projectprop JOIN cvterm ON (projectprop.type_id=cvterm.cvterm_id) WHERE cvterm.name=?";
521 my $h = $self->dbh->prepare($q);
522 $h->execute($term);
523 my ($type_id) = $h->fetchrow_array();
524 return $type_id;