start fixing test for multi cat phenotype upload.
[sgn.git] / lib / CXGN / Genotype / SequenceMetadata.pm
blobe0d69474d12df00c7ccbd50cf8cdca4ac133b1e7
1 package CXGN::Genotype::SequenceMetadata;
3 =head1 NAME
5 CXGN::Genotype::SequenceMetadata - used to manage sequence metadata in the featureprop_json table
7 =head1 USAGE
9 To preprocess and verify a gff file for the featureprop_json table:
10 my $smd = CXGN::Genotype::SequenceMetadata->new(bcs_schema => $schema);
11 my $verification_results = $smd->verify($original_filepath, $processed_filepath, 'Triticum aestivum');
14 To create a new sequence metadata protocol and store a gff file to the featureprop_json table:
15 (bcs_schema and type_id are required)
16 my $smd = CXGN::Genotype::SequenceMetadata->new(bcs_schema => $schema, type_id => $cvterm_id);
18 1) create a new sequence metadata protocol
19 (protocol name, protocol description, and reference genome are required)
20 my %protocol_args = (
21 protocol_name => $protocol_name,
22 protocol_description => $protocol_description,
23 reference_genome => $protocol_reference_genome,
24 score_description => $protocol_score_description,
25 attributes => \%attributes
27 my $protocol_results = $smd->create_protocol(\%protocol_args);
29 2) store the preprocessed gff file
30 my $store_results = $smd->store($processed_filepath, 'Triticum aestivum');
33 To use an existing sequence metadata protocol and store a gff file to the featureprop_json table:
34 (bcs_schema, type_id, and nd_protocol_id are required)
35 my $smd = CXGN::Genotype::SequenceMetadata->new(bcs_schema => $schema, type_id => $cvterm_id, nd_protocol_id => $nd_protocol_id);
36 my $store_results = $smd->store($processed_filepath, 'Triticum aestivum');
39 To query the stored sequence metadata:
40 (bcs_schema and feature_id are required, all other filters are optional)
41 my $smd = CXGN::Genotype::SequenceMetadata->new(bcs_schema => $schema)
42 my @attributes = (
44 key => 'score',
45 nd_protocol_id => 200,
46 comparison => 'lt',
47 value => 0
50 key => 'trait',
51 nd_protocol_id => 201,
52 comparison => 'eq',
53 value => 'grain yield'
56 my $query_results = $smd->query({
57 feature_id => $feature_id,
58 start => $start_pos,
59 end => $end_pos,
60 reference_genome => $reference_genome,
61 type_ids => \@type_ids,
62 nd_protocol_ids => \@nd_protocol_ids,
63 attributes => \@attributes
64 });
66 =head1 DESCRIPTION
69 =head1 AUTHORS
71 David Waring <djw64@cornell.edu>
73 =cut
76 use strict;
77 use warnings;
78 use Moose;
79 use JSON;
81 use SGN::Context;
82 use SGN::Model::Cvterm
85 has 'shell_script_dir' => (
86 isa => 'Str',
87 is => 'ro',
88 default => '/bin/sequence_metadata'
91 has 'bcs_schema' => (
92 isa => 'Bio::Chado::Schema',
93 is => 'rw',
94 required => 1
97 has 'type_id' => (
98 isa => 'Int|Undef',
99 is => 'rw'
102 has 'nd_protocol_id' => (
103 isa => 'Int|Undef',
104 is => 'rw'
109 sub BUILD {
110 my $self = shift;
115 # Preprocess and verify a gff file for the featureprop_json table
116 # - Remove comments from the original file
117 # - Sort the file by seqid and start position
118 # - Check to make sure all of the seqid's exist as features in the database
119 # - the features' organism name must match the provided species argument
120 # - Check if protocol attributes exist in the file
121 # - Remove the original file, if processed successfully
123 # Arguments:
124 # - input: full filepath to the original gff3 file
125 # - output: full filepath to the processed gff3 file generated by this script
126 # - species: name of the species to use when checking for existing features (must match feature organism name)
127 # - protocol_attributes: arrayref of attributes specified by the protocol
129 # Returns a hash with the following keys:
130 # - processed: 0 if processing fails, 1 if it succeeds
131 # - verified: 0 if verification fails, 1 if it succeeds
132 # - missing_features: list of missing features
133 # - missing_attributes: list of attributes in the protocol that are not in the uploaded file
134 # - undefined attributes: list attributes in the uploaded file not defined by the protocol
135 # - error: error message
137 sub verify {
138 my $self = shift;
139 my $input = shift;
140 my $output = shift;
141 my $species = shift;
142 my $protocol_attributes = shift;
143 my $c = SGN::Context->new;
145 my %results = (
146 processed => 0,
147 verified => 0
150 # PROCESS THE INPUT FILE
151 # Remove comments
152 # Sort by seqid and start
153 # Save to output file
154 my $script = $c->get_conf('basepath') . $self->shell_script_dir . "/preprocess_featureprop_json.sh";
155 my $cmd = "bash " . $script . " \"" . $input . "\" \"" . $output . "\"";
156 my $rv = system($cmd);
157 if ($rv == -1) {
158 $results{'error'} = "Could not launch pre-processing script: $!";
160 elsif (my $s = $rv & 127) {
161 $results{'error'} = "Pre-processing script died from signal $s";
163 elsif (my $e = $rv >> 8) {
164 $results{'error'} = "Pre-processing script exited with code $e";
167 # VERIFY THE FEATURES
168 # Get a unique list of all seqid's
169 # Make sure each seqid matches a feature in the database
170 if ( $rv == 0 ) {
171 $results{'processed'} = 1;
173 # Remove original file
174 unlink $input;
176 # Get unique list of features
177 my $script = $c->get_conf('basepath') . $self->shell_script_dir . "/get_unique_features.sh";
178 my $cmd = "bash " . $script . " \"" . $output . "\"";
179 my @features = `$cmd`;
181 # Check each feature in the database
182 my @missing = ();
183 foreach my $feature ( @features ) {
184 chomp($feature);
185 my $query = "SELECT feature_id
186 FROM public.feature
187 LEFT JOIN public.organism USING (organism_id)
188 WHERE uniquename=?
189 AND CONCAT(organism.genus, ' ', REGEXP_REPLACE(organism.species, CONCAT('^', organism.genus, ' '), ''))=?;";
190 my $sth = $self->bcs_schema->storage->dbh()->prepare($query);
191 $sth->execute($feature, $species);
192 my ($feature_id) = $sth->fetchrow_array();
193 if ( $feature_id eq "" ) {
194 push(@missing, $feature . ' (' . $species . ')');
197 my $missing_count = scalar(@missing);
198 $results{'missing_features'} = \@missing;
200 # Verified successfully if no missing features
201 if ( $missing_count == 0 ) {
202 $results{'verified'} = 1;
206 # VERIFY THE ATTRIBUTES
207 # Get file attributes
208 my @missing_attributes = ();
209 my @undefined_attributes = ();
210 my $attributes_script = $c->get_conf('basepath') . $self->shell_script_dir . "/get_unique_attributes.sh";
211 my $attributes_cmd = "bash " . $attributes_script . " \"" . $output . "\"";
212 my @file_attributes = `$attributes_cmd`;
213 chomp(@file_attributes);
215 # Compare file and protocol attributes
216 my $has_undefined_attributes = 0;
217 foreach my $file_attribute ( @file_attributes ) {
218 if ( !grep( /^$file_attribute$/, @$protocol_attributes ) ) {
219 push(@undefined_attributes, $file_attribute);
220 $has_undefined_attributes = 1;
223 my $has_missing_attributes = 0;
224 foreach my $protocol_attribute ( @$protocol_attributes ) {
225 if ( !grep( /^$protocol_attribute$/, @file_attributes) ) {
226 push(@missing_attributes, $protocol_attribute);
227 $has_missing_attributes = 1;
231 # Set attribute response(s)
232 if ( $has_undefined_attributes ) {
233 $results{'undefined_attributes'} = \@undefined_attributes;
235 if ( $has_missing_attributes ) {
236 $results{'missing_attributes'} = \@missing_attributes;
240 return(\%results);
245 # Create a new Sequence Metadata Protocol with the specified properties
247 # Arguments: as a hash with the following keys:
248 # - protocol_name = the name of the sequence metadata protocol
249 # - protocol_description = the description of the sequence metadata protocol
250 # - reference_genome = the name of the reference genome used by the sequence metadata positions
251 # - score_description = (optional) the description of the store value
252 # - attributes = (optional) a hashref of attributes (keys) and their descriptions (values)
253 # - links = (optional) a hashref of external links (keys = title, values = URL template)
255 # Returns a hash with the following keys:
256 # - error: error message
257 # - created: 0 if creation failed, 1 if it succeeds
258 # - nd_protocol_id: the id of the new protocol
260 sub create_protocol {
261 my $self = shift;
262 my $args = shift;
263 my $protocol_name = $args->{'protocol_name'};
264 my $protocol_description = $args->{'protocol_description'};
265 my $reference_genome = $args->{'reference_genome'};
266 my $score_description = $args->{'score_description'};
267 my $attributes = $args->{'attributes'};
268 my $links = $args->{'links'};
270 # Protocol Results
271 my %results = (
272 created => 0,
273 nd_protocol_id => 0
276 # Make sure SMD type_id is set
277 if ( !defined $self->type_id || $self->type_id eq '' ) {
278 $results{'error'} = "Sequence Metadata type_id not set!";
279 return(\%results);
282 # Check required arguments
283 if ( !$protocol_name || $protocol_name eq '' ) {
284 $results{'error'} = "Protocol name not set!";
285 return(\%results);
287 if ( !$protocol_description || $protocol_description eq '' ) {
288 $results{'error'} = "Protocol description not set!";
289 return(\%results);
291 if ( !$reference_genome || $reference_genome eq '' ) {
292 $results{'error'} = "Reference genome not set!";
293 return(\%results);
296 # Get type name from type_id
297 my $sequence_metadata_type = $self->bcs_schema->resultset("Cv::Cvterm")->find({ cvterm_id=>$self->type_id })->name();
299 # Set the protocol props
300 my %sequence_metadata_protocol_props = (
301 sequence_metadata_type_id => $self->type_id,
302 sequence_metadata_type => $sequence_metadata_type,
303 reference_genome => $reference_genome
305 if ( defined $score_description && $score_description ne '' ) {
306 $sequence_metadata_protocol_props{'score_description'} = $score_description;
308 if ( defined $attributes ) {
309 $sequence_metadata_protocol_props{'attribute_descriptions'} = $attributes;
311 if ( defined $links ) {
312 $sequence_metadata_protocol_props{'links'} = $links;
315 # Create Protocol with props
316 my $sequence_metadata_protocol_type_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'sequence_metadata_protocol', 'protocol_type')->cvterm_id();
317 my $sequence_metadata_protocol_prop_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'sequence_metadata_protocol_properties', 'protocol_property')->cvterm_id();
318 my $protocol = $self->bcs_schema->resultset('NaturalDiversity::NdProtocol')->create({
319 name => $protocol_name,
320 type_id => $sequence_metadata_protocol_type_id,
321 nd_protocolprops => [{type_id => $sequence_metadata_protocol_prop_cvterm_id, value => encode_json \%sequence_metadata_protocol_props}]
323 my $protocol_id = $protocol->nd_protocol_id();
325 # Update protocol description
326 my $dbh = $self->bcs_schema->storage->dbh();
327 my $sql = "UPDATE nd_protocol SET description=? WHERE nd_protocol_id=?;";
328 my $sth = $dbh->prepare($sql);
329 $sth->execute($protocol_description, $protocol_id);
331 # Set the protocol id
332 $self->nd_protocol_id($protocol_id);
334 # Return Results
335 $results{'created'} = 1;
336 $results{'nd_protocol_id'} = $protocol_id;
337 return(\%results);
343 # Store sequence metadata from a gff file to the featureprop_json table
345 # Arguments:
346 # - input: the full filepath to the gff3 file to parse
347 # - species: the name of the species to be used when finding matching features from the input file
348 # - chunk_size: (optional) max number of items to store in a single database row (default 8000)
350 # Returns a hash with the following keys:
351 # - error: error message
352 # - stored: 0 if storing failed, 1 if it succeeds
353 # - chunks: the number of chunks stored
355 sub store {
356 my $self = shift;
357 my $input = shift;
358 my $species = shift;
359 my $chunk_size = shift;
361 my %results = (
362 stored => 0,
363 chunks => 0
366 # Make sure type_id and nd_protocol_id are set
367 if ( !defined $self->type_id || $self->type_id eq '' ) {
368 $results{'error'} = "Sequence Metadata type_id not set!";
369 return(\%results);
371 if ( !defined $self->nd_protocol_id || $self->nd_protocol_id eq '' ) {
372 $results{'error'} = "Sequence Metadata nd_protocol_id not set!";
373 return(\%results);
376 # Set default chunk size
377 if ( !defined $chunk_size || $chunk_size eq '' ) {
378 $chunk_size = 8000;
381 # Check cvterm id
382 my $cvterm = $self->bcs_schema->resultset("Cv::Cvterm")->find({ cvterm_id => $self->type_id });
383 if ( !$cvterm ) {
384 $results{'error'} = "No matching cvterm found for the specified type id";
385 return(\%results);
388 # Check nd protocol id
389 my $nd_protocol = $self->bcs_schema->resultset("NaturalDiversity::NdProtocol")->find({ nd_protocol_id => $self->nd_protocol_id });
390 if ( !$nd_protocol ) {
391 $results{'error'} = "No matching nd protocol found for the specified nd protocol id";
392 return(\%results);
397 # Open the input file
398 open(my $fh, '<', $input) or die "Could not open input file\n";
400 # Properties of the current chunk
401 my $chunk_feature = undef; # the name of the chunk's feature (if the current line's feature name is different, start a new chunk)
402 my $chunk_start = undef; # the min start position of the chunk's contents
403 my $chunk_end = undef; # the max end position of the chunk's contents
404 my @chunk_values = (); # the chunk's values (to be converted to JSON array)
405 my $chunk_count = 0; # the number of items in the chunk (if the count exceeds the chunk_size, start a new chunk)
406 my $total = 0; # the total number of chunks
408 # Parse the input by line
409 while ( defined(my $line = <$fh>) ) {
410 chomp $line;
411 next if ( $line =~ /^#/ );
413 # Get data from line
414 my @data = split(/\t/, $line);
415 my $feature = $data[0] ne "." ? $data[0] : "";
416 my $start = $data[3] ne "." ? $data[3] : "";
417 my $end = $data[4] ne "." ? $data[4] : "";
418 my $score = $data[5] ne "." ? $data[5] : "";
419 my $attributes = $data[8] ne "." ? $data[8] : "";
421 # Put unknown start / stop positions at 0
422 $start = $start eq "" ? 0 : $start;
423 $end = $end eq "" ? 0 : $end;
425 # Write the current chunk to the database
426 # when the feature changes or the chunk size has been reached
427 if ( ($chunk_feature && $feature ne $chunk_feature) || $chunk_count > $chunk_size ) {
428 _write_chunk($self, $chunk_feature, $species, \@chunk_values, $chunk_start, $chunk_end);
430 # Reset chunk properties
431 $chunk_feature = undef;
432 $chunk_start = undef;
433 $chunk_end = undef;
434 @chunk_values = ();
435 $chunk_count = 0;
436 $total++;
439 # Parse attributes
440 my %attribute_hash = ();
441 if ( $attributes ne "." ) {
442 my @as = split(/;/, $attributes);
443 for my $a (@as) {
444 my @kv = split(/=/, $a);
445 my $key = $kv[0];
446 my $value = $kv[1];
447 ($value) = $value =~ /^"?([^"]*)"?$/;
448 if ( $key eq "score" || $key eq "start" || $key eq "end" ) {
449 $results{'error'} = "Line has reserved key in attribute list (attributes cannot use keys of 'score', 'start' or 'end')";
450 return(\%results);
452 $attribute_hash{$key} = $value;
456 # Set chunk properties
457 if ( !$chunk_feature ) {
458 $chunk_feature = $feature;
460 if ( !$chunk_start || $start < $chunk_start ) {
461 $chunk_start = $start
463 if ( !$chunk_end || $end > $chunk_end ) {
464 $chunk_end = $end;
466 my %value = ( score => $score, start => $start, end => $end );
467 %value = (%value, %attribute_hash);
469 push @chunk_values, \%value;
470 $chunk_count++;
473 # Write the last chunk
474 _write_chunk($self, $chunk_feature, $species, \@chunk_values, $chunk_start, $chunk_end);
475 $total++;
477 # Return the number of chunks written
478 $results{'stored'} = 1;
479 $results{'chunks'} = $total;
480 return(\%results);
485 # Write a chunk / row of sequence metadata to the database
487 sub _write_chunk() {
488 my $self = shift;
489 my $chunk_feature = shift;
490 my $species = shift;
491 my $chunk_values = shift;
492 my $chunk_start = shift;
493 my $chunk_end = shift;
495 my $dbh = $self->bcs_schema->storage->dbh();
496 my $type_id = $self->type_id;
497 my $nd_protocol_id = $self->nd_protocol_id;
499 # Get Feature ID
500 my $query = "SELECT feature_id
501 FROM public.feature
502 LEFT JOIN public.organism USING (organism_id)
503 WHERE uniquename=?
504 AND CONCAT(organism.genus, ' ', REGEXP_REPLACE(organism.species, CONCAT('^', organism.genus, ' '), ''))=?;";
505 my $sth = $dbh->prepare($query);
506 $sth->execute($chunk_feature, $species);
507 my ($feature_id) = $sth->fetchrow_array();
509 # Check Feature ID
510 if ( !$feature_id || $feature_id eq "" ) {
511 die "No matching feature for specified seqid [$chunk_feature ($species)]\n";
514 # Convert values to JSON array string
515 my $json_str = encode_json($chunk_values);
517 # Insert into the database
518 my $insert = "INSERT INTO public.featureprop_json (feature_id, type_id, nd_protocol_id, start_pos, end_pos, json) VALUES (?, ?, ?, ?, ?, ?);";
519 my $ih = $dbh->prepare($insert);
520 $ih->execute($feature_id, $type_id, $nd_protocol_id, $chunk_start, $chunk_end, $json_str);
525 # Query the sequence metadata stored in the featureprop_json table with the provided filter parameters
527 # Arguments (as a hash with the following keys):
528 # - feature_id = id of feature associated with the sequence metadata
529 # - start = (optional) start position of query region (default: 0)
530 # - end = (optional) end position of query region (default: feature max)
531 # - reference_genome = (required if start and/or end provided) the reference genome name of the start and/or end positions
532 # - type_ids = (optional) array of sequence metadata cvterm ids (default: object type_id, if defined, or include all)
533 # - nd_protocol_ids = (optional) array of nd_protocol_ids (default: object nd_protocol_id, if defined, or include all)
534 # - attributes = (optional) an array of attribute properties to include in the filter (default: none)
535 # the attribute properties are a hash with the following keys:
536 # - key = attribute key (score, secondary attribute key, etc)
537 # - nd_protocol_id = the id of the protocol to apply this attribute filter to
538 # - comparison = one of: con, eq, lt, lte, gt, gte to use as the comparison of the stored value to the specified value
539 # - value = the value to use in the comparison
541 # Returns a hash with the following keys:
542 # - error: an error message, if an error was encountered
543 # - results: an array of sequence metadata objects with the following keys:
544 # - featureprop_json_id = id of associated featureprop_json chunk
545 # - feature_id = id of associated feature
546 # - feature_name = name of associated feature
547 # - type_id = cvterm_id of sequence metadata type
548 # - type_name = name of sequence metadata type
549 # - nd_protocol_id = id of associated nd_protocol
550 # - nd_protocol_name = name of associated nd_protocol
551 # - start = start position of sequence metadata
552 # - end = end position of sequence metadata
553 # - score = primary score value of sequence metadata
554 # - attributes = hash of secondary key/value attributes
555 # - links = hash of external link titles and URLs
557 sub query {
558 my ($self, $args) = @_;
559 my $feature_id = $args->{feature_id};
560 my $start = $args->{start};
561 my $end = $args->{end};
562 my $reference_genome = $args->{reference_genome};
563 my $min_score = $args->{min_score};
564 my $max_score = $args->{max_score};
565 my $type_ids = $args->{type_ids};
566 my $nd_protocol_ids = $args->{nd_protocol_ids};
567 my $attributes = $args->{attributes};
568 my $MAX_CHUNK_COUNT = 1000; # The max number of chunks to try to query at once, more than this returns an error
570 my $schema = $self->bcs_schema;
571 my $dbh = $schema->storage->dbh();
573 my %results = (
574 results => ()
577 # Check for required parameters
578 if ( !defined $feature_id || $feature_id eq '' ) {
579 $results{'error'} = "Feature ID not provided!";
580 return(\%results);
582 if ( (!defined $reference_genome || $reference_genome eq '') && (defined $start || defined $end) ) {
583 $results{'error'} = "Reference genome must be provided with start and/or end positions(s)!";
584 return(\%results);
587 # Set undefined start / end positions
588 if ( !defined $start || $start eq '' ) {
589 $start = 0;
591 if ( !defined $end || $end eq '' ) {
592 my $q = "SELECT MAX(end_pos) FROM public.featureprop_json WHERE feature_id = ?";
593 my $h = $dbh->prepare($q);
594 $h->execute($feature_id);
595 ($end) = $h->fetchrow_array();
598 # Set undefined type_ids / nd_protocol_ids (use class arguments, if available)
599 if ( !defined $type_ids ) {
600 $type_ids = $self->type_id ? [$self->type_id] : [];
602 if ( !defined $nd_protocol_ids ) {
603 $nd_protocol_ids = $self->nd_protocol_id ? [$self->nd_protocol_id] : [];
606 # Check if feature_id exists
607 my $feature_rs = $schema->resultset("Sequence::Feature")->find({ feature_id => $feature_id });
608 if ( !defined $feature_rs ) {
609 $results{'error'} = 'Could not find matching feature!';
610 return(\%results);
613 # Check if type_id exists and is valid type
614 foreach my $type_id (@$type_ids) {
615 my $cv_rs = $schema->resultset('Cv::Cv')->find({ name => "sequence_metadata_types" });
616 my $cvterm_rs = $schema->resultset('Cv::Cvterm')->find({ cvterm_id => $type_id, cv_id => $cv_rs->cv_id() });
617 if ( !defined $cvterm_rs ) {
618 $results{'error'} = "Could not find matching sequence metadata type [$type_id]!";
619 return(\%results);
623 # Check if nd_protocol_id exists and is valid type
624 foreach my $nd_protocol_id (@$nd_protocol_ids) {
626 # Check for existing sequence metadata protocol
627 my $protocol_cv_rs = $schema->resultset('Cv::Cv')->find({ name => "protocol_type" });
628 my $protocol_cvterm_rs = $schema->resultset('Cv::Cvterm')->find({ name => "sequence_metadata_protocol", cv_id => $protocol_cv_rs->cv_id() });
629 my $protocol_rs = $schema->resultset("NaturalDiversity::NdProtocol")->find({ nd_protocol_id => $nd_protocol_id, type_id => $protocol_cvterm_rs->cvterm_id() });
630 if ( !defined $protocol_rs ) {
631 $results{'error'} = "Could not find matching nd_protocol [$nd_protocol_id]!";
632 return(\%results);
637 # Get protocol external link definitions
638 my %external_links = ();
639 my $link_query = "SELECT nd_protocol_id, value->>'links' AS links
640 FROM nd_protocolprop
641 WHERE type_id = (SELECT cvterm_id FROM public.cvterm WHERE name = 'sequence_metadata_protocol_properties');";
642 my $lh = $dbh->prepare($link_query);
643 $lh->execute();
644 while (my ($nd_protocol_id, $link_json) = $lh->fetchrow_array()) {
645 my $links = defined $link_json ? decode_json $link_json : decode_json '{}';
646 $external_links{$nd_protocol_id} = $links;
649 # Build the base query conditions
650 my @query_params = ($feature_id, $start, $start, $end, $end, $start, $end);
651 my $query_where = " WHERE featureprop_json.feature_id = ? ";
652 $query_where .= " AND ((featureprop_json.start_pos <= ? AND featureprop_json.end_pos >= ?) OR (featureprop_json.start_pos <= ? AND featureprop_json.end_pos >= ?) OR (featureprop_json.start_pos >= ? AND featureprop_json.end_pos <= ?)) ";
653 if ( $type_ids && @$type_ids ) {
654 $query_where .= " AND featureprop_json.type_id IN (@{[join',', ('?') x @$type_ids]})";
655 push(@query_params, @$type_ids);
657 if ( $nd_protocol_ids && @$nd_protocol_ids ) {
658 $query_where .= " AND featureprop_json.nd_protocol_id IN (@{[join',', ('?') x @$nd_protocol_ids]})";
659 push(@query_params, @$nd_protocol_ids);
661 if ( defined $reference_genome && $reference_genome ne '' ) {
662 $query_where .= " AND featureprop_json.nd_protocol_id IN (SELECT nd_protocol_id FROM nd_protocolprop WHERE type_id = (SELECT cvterm_id FROM public.cvterm WHERE name='sequence_metadata_protocol_properties') AND value->>'reference_genome' = ?) ";
663 push(@query_params, $reference_genome);
666 # Estimate result size by getting number of matching chunks
667 my $size_query = "SELECT COUNT(featureprop_json_id) ";
668 $size_query .= "FROM public.featureprop_json";
669 $size_query .= $query_where;
670 my $sh = $dbh->prepare($size_query);
671 $sh->execute(@query_params);
672 my ($chunk_count) = $sh->fetchrow_array();
674 # DATA QUERY TOO LARGE: return with an error message
675 if ( $chunk_count > $MAX_CHUNK_COUNT ) {
676 $results{'error'} = "The requested query is too large to perform at once. Try filtering the data by including fewer data types/protocols and/or a smaller sequence range.";
677 return(\%results);
681 # Build query
682 my $query = "SELECT featureprop_json.featureprop_json_id, featureprop_json.feature_id, feature.name AS feature_name, (s->>'start')::int AS start, (s->>'end')::int AS end, featureprop_json.type_id, cvterm.name AS type_name, featureprop_json.nd_protocol_id, nd_protocol.name AS nd_protocol_name, s AS attributes
683 FROM featureprop_json
684 LEFT JOIN jsonb_array_elements(featureprop_json.json) as s(data) on true
685 LEFT JOIN public.feature ON feature.feature_id = featureprop_json.feature_id
686 LEFT JOIN public.cvterm ON cvterm.cvterm_id = featureprop_json.type_id
687 LEFT JOIN public.nd_protocol ON nd_protocol.nd_protocol_id = featureprop_json.nd_protocol_id";
689 # Add aditional conditions
690 $query .= $query_where;
691 $query .= " AND (
692 ((s->>'start')::int >= ? AND (s->>'end')::int <= ?)
693 OR ((s->>'start')::int >= ? AND (s->>'start')::int <= ? AND (s->>'end')::int > ?)
694 OR ((s->>'start')::int < ? AND (s->>'end')::int >= ? AND (s->>'end')::int <= ?)
695 OR ((s->>'start')::int < ? AND (s->>'end')::int > ?)
697 push(@query_params, $start, $end, $start, $end, $end, $start, $start, $end, $start, $end);
698 if ( $attributes && @$attributes ) {
699 $query .= " AND (";
700 my @aq = ();
701 foreach my $a (@$attributes ) {
702 my $ap = $a->{nd_protocol_id};
703 my $ak = $a->{key};
704 my $ac = $a->{comparison};
705 my $av = $a->{value};
707 my $cast = undef;
708 my $comp = undef;
709 if ( $ac eq "lt" ) {
710 $comp = "<";
711 $cast = "real";
713 elsif ( $ac eq "lte" ) {
714 $comp = "<=";
715 $cast = "real";
717 elsif ( $ac eq "gt" ) {
718 $comp = ">";
719 $cast = "real";
721 elsif ( $ac eq "gte" ) {
722 $comp = ">=";
723 $cast = "real";
725 elsif ( $ac eq "con" ) {
726 $comp = "LIKE";
727 $cast = "text";
728 $av = "%" . $av . "%";
730 else {
731 $comp = "=";
732 $cast = "text";
735 my $q = "(featureprop_json.nd_protocol_id <> ? OR (featureprop_json.nd_protocol_id = ?";
736 $q .= " AND CASE WHEN s->>? = '' THEN FALSE ELSE (s->>?)::$cast $comp ? END))";
738 push(@aq, $q);
739 push(@query_params, $ap, $ap, $ak, $ak, $av);
741 $query .= join(" AND ", @aq);
742 $query .= ") ";
744 $query .= " ORDER BY start ASC";
746 # print STDERR "QUERY:\n";
747 # print STDERR "$query\n";
748 # print STDERR "QUERY PARAMS:\n";
749 # use Data::Dumper;
750 # print STDERR Dumper \@query_params;
752 # Perform the search
753 my $h = $dbh->prepare($query);
754 $h->execute(@query_params);
756 # Parse the results
757 my @matches = ();
758 while (my ($featureprop_json_id, $feature_id, $feature_name, $start, $end, $type_id, $type_name, $nd_protocol_id, $nd_protocol_name, $attributes_json) = $h->fetchrow_array()) {
759 my $attributes = decode_json $attributes_json;
760 my $score = $attributes->{score};
761 delete $attributes->{score};
762 delete $attributes->{start};
763 delete $attributes->{end};
765 # Build Links, if provided and variables are defined
766 my %links = ();
767 my $defs = $external_links{$nd_protocol_id};
768 if ( defined $defs ) {
769 foreach my $title (keys %$defs) {
770 my $template = $defs->{$title};
771 my @variables = $template =~ /\{\{([^\}]*)\}\}/g;
773 # Get values for replacement
774 my $skip = 0; # flag to skip link if one or more variables are undefined for this match
775 my %values = (); # hash of variables and values to replace in the link
776 foreach my $variable (@variables) {
777 if ( $variable eq 'feature' ) {
778 $values{'feature'} = $feature_name;
780 elsif ( $variable eq 'start' ) {
781 $values{'start'} = $start;
783 elsif ( $variable eq 'end' ) {
784 $values{'end'} = $end;
786 else {
787 my $value = $attributes->{$variable};
788 if ( defined $value && $value ne '' ) {
789 $values{$variable} = $value;
791 else {
792 $skip = 1;
797 # Build link, replace variables with values
798 if ( !$skip ) {
799 my $link = $template;
800 foreach my $k (keys %values) {
801 my $v = $values{$k};
802 my $f = "\\{\\{$k\\}\\}";
803 $link =~ s/$f/$v/g;
805 $links{$title} = $link;
810 my %match = (
811 featureprop_json_id => $featureprop_json_id,
812 feature_id => $feature_id,
813 feature_name => $feature_name,
814 type_id => $type_id,
815 type_name => $type_name,
816 nd_protocol_id => $nd_protocol_id,
817 nd_protocol_name => $nd_protocol_name,
818 score => $score ne '' ? $score += 0 : '',
819 start => $start += 0,
820 end => $end += 0,
821 attributes => $attributes,
822 links => \%links
824 push(@matches, \%match);
827 # Return the results
828 $results{'results'} = \@matches;
829 return(\%results);