start fixing test for multi cat phenotype upload.
[sgn.git] / lib / CXGN / Genotype / Search.pm
blobd2074c7b65fb6305f2929c3a00601ab80f294be3
1 package CXGN::Genotype::Search;
3 =head1 NAME
5 CXGN::Genotype::Search - an object to handle searching genotypes for stocks
7 =head1 USAGE
9 PLEASE BE AWARE THAT THE DEFAULT OPTIONS FOR genotypeprop_hash_select, protocolprop_top_key_select, protocolprop_marker_hash_select ARE PRONE TO EXCEEDING THE MEMORY LIMITS OF VM. CHECK THE MOOSE ATTRIBUTES BELOW TO SEE THE DEFAULTS, AND ADJUST YOUR MOOSE INSTANTIATION ACCORDINGLY
11 my $genotypes_search = CXGN::Genotype::Search->new({
12 bcs_schema=>$schema,
13 people_schema=>$people_schema,
14 accession_list=>$accession_list,
15 tissue_sample_list=>$tissue_sample_list,
16 trial_list=>$trial_list,
17 protocol_id_list=>$protocol_id_list,
18 markerprofile_id_list=>$markerprofile_id_list,
19 genotype_data_project_list=>$genotype_data_project_list,
20 chromosome_list=>\@chromosome_numbers,
21 start_position=>$start_position,
22 end_position=>$end_position,
23 marker_name_list=>['S80_265728', 'S80_265723'],
24 genotypeprop_hash_select=>['DS', 'GT', 'DP'], #THESE ARE THE KEYS IN THE GENOTYPEPROP OBJECT
25 protocolprop_top_key_select=>['reference_genome_name', 'header_information_lines', 'marker_names', 'markers'], #THESE ARE THE KEYS AT THE TOP LEVEL OF THE PROTOCOLPROP OBJECT
26 protocolprop_marker_hash_select=>['name', 'chrom', 'pos', 'alt', 'ref'], #THESE ARE THE KEYS IN THE MARKERS OBJECT IN THE PROTOCOLPROP OBJECT
27 return_only_first_genotypeprop_for_stock=>0, #THIS IS TO CONSERVE MEMORY USAGE
28 limit=>$limit,
29 offset=>$offset,
30 forbid_cache=>$forbid_cache
31 # marker_search_hash_list=>[{'S80_265728' => {'pos' => '265728', 'chrom' => '1'}}], NOT IMPLEMENTED
32 # marker_score_search_hash_list=>[{'S80_265728' => {'GT' => '0/0', 'GQ' => '99'}}], NOT IMPLEMENTED
33 });
34 # my ($total_count, $genotypes) = $genotypes_search->get_genotype_info();
36 # RECOMMENDED
37 If you just want to get a file with the genotype result in a dosage matrix or VCF file, use get_cached_file_dosage_matrix or get_cached_file_VCF functions instead.
38 If you want results in json format use get_cached_file_search_json
39 If you want results in json format for only the metadata (no genotype call data), use get_cached_file_search_json()
41 =head1 DESCRIPTION
44 =head1 AUTHORS
46 Nicolas Morales <nm529@cornell.edu>
47 With code moved from CXGN::BreederSearch
48 Lukas Mueller <lam87@cornell.edu>
49 Aimin Yan <ay247@cornell.edu>
51 =cut
53 use strict;
54 use warnings;
55 use Moose;
56 use Try::Tiny;
57 use Data::Dumper;
58 use SGN::Model::Cvterm;
59 use CXGN::Trial;
60 use JSON;
61 use CXGN::Stock::Accession;
62 use CXGN::Genotype::Protocol;
63 use CXGN::Genotype::ComputeHybridGenotype;
64 use Cache::File;
65 use Digest::MD5 qw | md5_hex |;
66 use File::Slurp qw | write_file |;
67 use File::Temp qw | tempfile |;
68 use File::Copy;
69 use POSIX;
71 has 'bcs_schema' => (
72 isa => 'Bio::Chado::Schema',
73 is => 'rw',
74 required => 1
77 has 'people_schema' => (
78 isa => 'CXGN::People::Schema',
79 is => 'rw',
80 required => 1
83 has 'protocol_id_list' => (
84 isa => 'ArrayRef[Int]|Undef',
85 is => 'rw',
88 has 'markerprofile_id_list' => (
89 isa => 'ArrayRef[Int]|Undef',
90 is => 'ro',
93 has 'accession_list' => (
94 isa => 'ArrayRef[Int]|Undef',
95 is => 'ro',
98 has 'tissue_sample_list' => (
99 isa => 'ArrayRef[Int]|Undef',
100 is => 'ro',
103 has 'trial_list' => (
104 isa => 'ArrayRef[Int]|Undef',
105 is => 'ro',
108 has 'genotype_data_project_list' => (
109 isa => 'ArrayRef[Int]|Undef',
110 is => 'ro',
113 has 'chromosome_list' => (
114 isa => 'ArrayRef[Int]|ArrayRef[Str]|Undef',
115 is => 'ro',
118 has 'start_position' => (
119 isa => 'Int|Undef',
120 is => 'ro',
123 has 'end_position' => (
124 isa => 'Int|Undef',
125 is => 'ro',
128 has 'marker_name_list' => (
129 isa => 'ArrayRef[Str]|Undef',
130 is => 'ro',
133 has 'genotypeprop_hash_select' => (
134 isa => 'ArrayRef[Str]',
135 is => 'ro',
136 default => sub {['GT', 'AD', 'DP', 'GQ', 'DS', 'PL', 'NT']} #THESE ARE THE GENERIC AND EXPECTED VCF ATRRIBUTES
139 has 'protocolprop_top_key_select' => (
140 isa => 'ArrayRef[Str]',
141 is => 'ro',
142 default => sub {['reference_genome_name', 'species_name', 'header_information_lines', 'sample_observation_unit_type_name', 'marker_names', 'markers', 'markers_array']} #THESE ARE ALL POSSIBLE TOP LEVEL KEYS IN PROTOCOLPROP BASED ON VCF LOADING
145 has 'protocolprop_marker_hash_select' => (
146 isa => 'ArrayRef[Str]',
147 is => 'ro',
148 default => sub {['name', 'chrom', 'pos', 'alt', 'ref', 'qual', 'filter', 'info', 'format']} #THESE ARE ALL POSSIBLE PROTOCOLPROP MARKER HASH KEYS BASED ON VCF LOADING
151 has 'return_only_first_genotypeprop_for_stock' => (
152 isa => 'Bool',
153 is => 'ro',
154 default => 0
157 # When using the get_cached_file_dosage_matrix or get_cached_file_VCF functions, need the following three keys.
158 has 'cache_root' => (
159 isa => 'Str',
160 is => 'rw',
163 has 'cache' => (
164 isa => 'Cache::File',
165 is => 'rw',
168 has 'cache_expiry' => (
169 isa => 'Int',
170 is => 'rw',
171 default => 0, # never expires?
174 has 'forbid_cache' => (
175 isa => 'Bool',
176 is => 'ro',
177 default => 0
180 has 'prevent_transpose' => (
181 isa => 'Bool',
182 is => 'ro',
183 default => 0
186 has '_iterator_query_handle' => (
187 isa => 'Ref',
188 is => 'rw'
191 has '_iterator_genotypeprop_query_handle' => (
192 isa => 'Ref',
193 is => 'rw'
196 has '_filtered_markers' => (
197 isa => 'HashRef',
198 is => 'rw',
199 default => sub {{}}
202 has '_vcf_genotyping_cvterm_id' => (
203 isa => 'Int',
204 is => 'rw'
207 has '_vcf_map_details_cvterm_id' => (
208 isa => 'Int',
209 is => 'rw'
212 has '_vcf_map_details_markers_cvterm_id' => (
213 isa => 'Int',
214 is => 'rw'
217 has '_vcf_map_details_markers_array_cvterm_id' => (
218 isa => 'Int',
219 is => 'rw'
222 has '_igd_genotypeprop_cvterm_id' => (
223 isa => 'Int',
224 is => 'rw'
227 has '_accession_cvterm_id' => (
228 isa => 'Int',
229 is => 'rw'
232 has '_tissue_sample_cvterm_id' => (
233 isa => 'Int',
234 is => 'rw'
237 has '_plot_cvterm_id' => (
238 isa => 'Int',
239 is => 'rw'
242 has '_plant_cvterm_id' => (
243 isa => 'Int',
244 is => 'rw'
247 has '_tissue_sample_of_cvterm_id' => (
248 isa => 'Int',
249 is => 'rw'
252 has '_protocolprop_markers_h' => (
253 isa => 'Ref',
254 is => 'rw'
257 has '_protocolprop_top_key_h' => (
258 isa => 'Ref',
259 is => 'rw'
262 has '_protocolprop_top_key_markers_h' => (
263 isa => 'Ref',
264 is => 'rw'
267 has '_protocolprop_top_key_markers_array_h' => (
268 isa => 'Ref',
269 is => 'rw'
272 has '_genotypeprop_h' => (
273 isa => 'Ref',
274 is => 'rw'
277 has '_protocolprop_marker_hash_select_arr' => (
278 isa => 'ArrayRef',
279 is => 'rw'
282 has '_protocolprop_top_key_select_arr' => (
283 isa => 'ArrayRef',
284 is => 'rw'
287 has '_selected_protocol_marker_info' => (
288 isa => 'Ref',
289 is => 'rw'
292 has '_selected_protocol_top_key_info' => (
293 isa => 'Ref',
294 is => 'rw'
297 has '_genotypeprop_infos' => (
298 isa => 'ArrayRef',
299 is => 'rw'
302 has '_genotypeprop_infos_counter' => (
303 isa => 'Int',
304 is => 'rw'
307 has '_genotypeprop_hash_select_arr' => (
308 isa => 'ArrayRef',
309 is => 'rw'
312 #NOT IMPLEMENTED
313 has 'marker_search_hash_list' => (
314 isa => 'ArrayRef[HashRef]|Undef',
315 is => 'ro',
318 #NOT IMPLEMENTED
319 has 'marker_score_search_hash_list' => (
320 isa => 'ArrayRef[HashRef]|Undef',
321 is => 'ro',
324 has 'limit' => (
325 isa => 'Int|Undef',
326 is => 'rw',
329 has 'offset' => (
330 isa => 'Int|Undef',
331 is => 'rw',
334 =head2 get_genotype_info
336 returns: an array with genotype information
338 =cut
340 =head2 get_genotype_info()
342 Function for getting genotype data iteratively.
343 Should be used like:
345 my $genotype_search = CXGN::Genotype::Search->new({
346 bcs_schema=>$schema,
347 etc...
349 my ($count, $genotype_data) = $genotype_search->get_genotype_info();
351 If you want to get results iteratively, use the init and iterator function defined below instead. Iterative retrieval minimizes memory load.
353 If you just want to get a file with the genotype result in a dosage matrix or VCF file, use get_cached_file_dosage_matrix or get_cached_file_VCF functions instead.
354 If you want results in json format use get_cached_file_search_json
355 If you want results in json format for only the metadata (no genotype call data), use get_cached_file_search_json()
357 =cut
359 sub get_genotype_info {
360 my $self = shift;
361 my $schema = $self->bcs_schema;
362 my $trial_list = $self->trial_list;
363 my $genotype_data_project_list = $self->genotype_data_project_list;
364 my $protocol_id_list = $self->protocol_id_list;
365 my $markerprofile_id_list = $self->markerprofile_id_list;
366 my $accession_list = $self->accession_list;
367 my $tissue_sample_list = $self->tissue_sample_list;
368 my $marker_name_list = $self->marker_name_list;
369 my $chromosome_list = $self->chromosome_list;
370 my $start_position = $self->start_position;
371 my $end_position = $self->end_position;
372 my $genotypeprop_hash_select = $self->genotypeprop_hash_select;
373 my $protocolprop_top_key_select = $self->protocolprop_top_key_select;
374 my $protocolprop_marker_hash_select = $self->protocolprop_marker_hash_select;
375 my $marker_search_hash_list = $self->marker_search_hash_list;
376 my $marker_score_search_hash_list = $self->marker_score_search_hash_list;
377 my $return_only_first_genotypeprop_for_stock = $self->return_only_first_genotypeprop_for_stock;
378 my $limit = $self->limit;
379 my $offset = $self->offset;
380 my @data;
381 my %search_params;
382 my @where_clause;
384 my $vcf_genotyping_cvterm_id = $self->search_vcf_genotyping_cvterm_id({protocol_id => $protocol_id_list->[0],genotype_id => $markerprofile_id_list->[0]});
385 my $vcf_genotyping_cvterm_id = $self->search_vcf_genotyping_cvterm_id();
387 my $vcf_map_details_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details', 'protocol_property')->cvterm_id();
388 my $vcf_map_details_markers_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details_markers', 'protocol_property')->cvterm_id();
389 my $vcf_map_details_markers_array_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details_markers_array', 'protocol_property')->cvterm_id();
390 my $igd_genotypeprop_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'igd number', 'genotype_property')->cvterm_id();
391 my $accession_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'accession', 'stock_type')->cvterm_id();
392 my $tissue_sample_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'tissue_sample', 'stock_type')->cvterm_id();
393 my $plot_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'plot', 'stock_type')->cvterm_id();
394 my $plant_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'plant', 'stock_type')->cvterm_id();
395 my $tissue_sample_of_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'tissue_sample_of', 'stock_relationship')->cvterm_id();
397 my @trials_accessions;
398 foreach (@$trial_list){
399 my $trial = CXGN::Trial->new({bcs_schema=>$schema, trial_id=>$_});
400 my $accessions = $trial->get_accessions();
401 foreach (@$accessions){
402 push @trials_accessions, $_->{stock_id};
406 #If accessions are explicitly given, then accessions found from trials will not be added to the search.
407 if (!$accession_list || scalar(@$accession_list)==0) {
408 push @$accession_list, @trials_accessions;
411 #For projects inserted into database during the addition of genotypes and genotypeprops
412 if (scalar(@trials_accessions)==0){
413 if ($trial_list && scalar(@$trial_list)>0) {
414 my $trial_sql = join ("," , @$trial_list);
415 push @where_clause, "project.project_id in ($trial_sql)";
419 #For genotyping_data_project
420 if ($genotype_data_project_list && scalar(@$genotype_data_project_list)>0) {
421 my $sql = join ("," , @$genotype_data_project_list);
422 push @where_clause, "project.project_id in ($sql)";
424 if ($protocol_id_list && scalar(@$protocol_id_list)>0) {
425 my $protocol_sql = join ("," , @$protocol_id_list);
426 push @where_clause, "nd_protocol.nd_protocol_id in ($protocol_sql)";
428 if ($accession_list && scalar(@$accession_list)>0) {
429 my $accession_sql = join ("," , @$accession_list);
430 push @where_clause, " ( stock.stock_id in ($accession_sql) OR (accession_of_tissue_sample.stock_id in ($accession_sql) AND accession_of_tissue_sample.type_id = $accession_cvterm_id) ) ";
431 push @where_clause, "stock.type_id in ($accession_cvterm_id, $tissue_sample_cvterm_id)";
433 if ($tissue_sample_list && scalar(@$tissue_sample_list)>0) {
434 my $stock_sql = join ("," , @$tissue_sample_list);
435 push @where_clause, "stock.stock_id in ($stock_sql)";
436 push @where_clause, "stock.type_id = $tissue_sample_cvterm_id";
438 if ($markerprofile_id_list && scalar(@$markerprofile_id_list)>0) {
439 my $markerprofile_sql = join ("," , @$markerprofile_id_list);
440 push @where_clause, "genotype.genotype_id in ($markerprofile_sql)";
442 if ($marker_name_list && scalar(@$marker_name_list)>0) {
443 my $search_vals_sql = "'".join ("','" , @$marker_name_list)."'";
444 push @where_clause, "nd_protocolprop.value->'marker_names' \\?& array[$search_vals_sql]";
446 if ($marker_search_hash_list && scalar(@$marker_search_hash_list)>0) {
447 foreach (@$marker_search_hash_list){
448 my $json_val = encode_json $_;
449 push @where_clause, "nd_protocolprop.value->'markers' \\@> $json_val"."::jsonb";
452 if ($marker_score_search_hash_list && scalar(@$marker_score_search_hash_list)>0) {
453 foreach (@$marker_score_search_hash_list){
454 my $json_val = encode_json $_;
455 push @where_clause, "genotype_values.value \\@> $json_val"."::jsonb";
459 my $where_clause = scalar(@where_clause)>0 ? " WHERE " . (join (" AND " , @where_clause)) : '';
461 my $offset_clause = '';
462 my $limit_clause = '';
463 if ($limit){
464 $limit_clause = " LIMIT $limit ";
466 if ($offset){
467 $offset_clause = " OFFSET $offset ";
470 my $stock_select = '';
471 if ($return_only_first_genotypeprop_for_stock) {
472 $stock_select = 'distinct on (stock.stock_id) stock.stock_id';
473 } else {
474 $stock_select = 'stock.stock_id';
477 my $q = "SELECT $stock_select, igd_number_genotypeprop.value, nd_protocol.nd_protocol_id, nd_protocol.name, stock.uniquename, stock.type_id, stock_cvterm.name, genotype.genotype_id, genotype.uniquename, genotype.description, project.project_id, project.name, project.description, accession_of_tissue_sample.stock_id, accession_of_tissue_sample.uniquename, count(genotype.genotype_id) OVER() AS full_count
478 FROM stock
479 JOIN cvterm AS stock_cvterm ON(stock.type_id = stock_cvterm.cvterm_id)
480 LEFT JOIN stock_relationship ON(stock_relationship.subject_id=stock.stock_id AND stock_relationship.type_id = $tissue_sample_of_cvterm_id)
481 LEFT JOIN stock AS accession_of_tissue_sample ON(stock_relationship.object_id=accession_of_tissue_sample.stock_id)
482 JOIN nd_experiment_stock ON(stock.stock_id=nd_experiment_stock.stock_id)
483 JOIN nd_experiment USING(nd_experiment_id)
484 JOIN nd_experiment_protocol USING(nd_experiment_id)
485 JOIN nd_experiment_project USING(nd_experiment_id)
486 JOIN nd_experiment_genotype USING(nd_experiment_id)
487 JOIN nd_protocol USING(nd_protocol_id)
488 LEFT JOIN nd_protocolprop ON(nd_protocolprop.nd_protocol_id = nd_protocol.nd_protocol_id AND nd_protocolprop.type_id = $vcf_map_details_cvterm_id)
489 JOIN genotype USING(genotype_id)
490 LEFT JOIN genotypeprop AS igd_number_genotypeprop ON(igd_number_genotypeprop.genotype_id = genotype.genotype_id AND igd_number_genotypeprop.type_id = $igd_genotypeprop_cvterm_id)
491 JOIN project USING(project_id)
492 $where_clause
493 ORDER BY stock.stock_id, genotype.genotype_id ASC
494 $limit_clause
495 $offset_clause;";
497 #print STDERR Dumper $q;
498 my $h = $schema->storage->dbh()->prepare($q);
499 $h->execute();
501 my $total_count = 0;
502 my @genotype_id_array;
503 my @genotypeprop_array;
504 my %genotype_hash;
505 my %genotypeprop_hash;
506 my %protocolprop_hash;
507 while (my ($stock_id, $igd_number_json, $protocol_id, $protocol_name, $stock_name, $stock_type_id, $stock_type_name, $genotype_id, $genotype_uniquename, $genotype_description, $project_id, $project_name, $project_description, $accession_id, $accession_uniquename, $full_count) = $h->fetchrow_array()) {
508 my $igd_number_hash = $igd_number_json ? decode_json $igd_number_json : undef;
509 my $igd_number = $igd_number_hash ? $igd_number_hash->{'igd number'} : undef;
510 $igd_number = !$igd_number && $igd_number_hash ? $igd_number_hash->{'igd_number'} : undef;
512 my $germplasmName = '';
513 my $germplasmDbId = '';
514 if ($stock_type_name eq 'accession'){
515 $germplasmName = $stock_name;
516 $germplasmDbId = $stock_id;
518 if ($stock_type_name eq 'tissue_sample'){
519 $germplasmName = $accession_uniquename;
520 $germplasmDbId = $accession_id;
523 my $stock_object = CXGN::Stock::Accession->new({schema=>$self->bcs_schema, stock_id=>$germplasmDbId});
525 push @genotype_id_array, $genotype_id;
527 $genotype_hash{$genotype_id} = {
528 markerProfileDbId => $genotype_id,
529 germplasmDbId => $germplasmDbId,
530 germplasmName => $germplasmName,
531 synonyms => $stock_object->synonyms,
532 stock_id => $stock_id,
533 stock_name => $stock_name,
534 stock_type_id => $stock_type_id,
535 stock_type_name => $stock_type_name,
536 genotypeDbId => $genotype_id,
537 genotypeUniquename => $genotype_uniquename,
538 genotypeDescription => $genotype_description,
539 analysisMethodDbId => $protocol_id,
540 analysisMethod => $protocol_name,
541 genotypingDataProjectDbId => $project_id,
542 genotypingDataProjectName => $project_name,
543 genotypingDataProjectDescription => $project_description,
544 igd_number => $igd_number,
546 $protocolprop_hash{$protocol_id}++;
547 $total_count = $full_count;
550 my @found_protocolprop_ids = keys %protocolprop_hash;
552 my @protocolprop_marker_hash_select_arr;
553 foreach (@$protocolprop_marker_hash_select){
554 push @protocolprop_marker_hash_select_arr, "s.value->>'$_'";
556 my @protocolprop_top_key_select_arr;
557 my %protocolprop_top_key_select_hash;
558 foreach (@$protocolprop_top_key_select){
559 if ($_ ne 'markers' && $_ ne 'markers_array') {
560 push @protocolprop_top_key_select_arr, "value->>'$_'";
562 $protocolprop_top_key_select_hash{$_}++;
564 my %selected_protocol_marker_info;
565 my %selected_protocol_top_key_info;
566 my %filtered_markers;
567 my $genotypeprop_chromosome_rank_string = '';
568 if (scalar(@found_protocolprop_ids)>0){
569 my $protocolprop_id_sql = join ("," , @found_protocolprop_ids);
570 my $protocolprop_where_sql = "nd_protocol_id in ($protocolprop_id_sql) and type_id = $vcf_map_details_cvterm_id";
571 my $protocolprop_where_markers_sql = "nd_protocol_id in ($protocolprop_id_sql) and type_id = $vcf_map_details_markers_cvterm_id";
572 my $protocolprop_where_markers_array_sql = "nd_protocol_id in ($protocolprop_id_sql) and type_id = $vcf_map_details_markers_array_cvterm_id";
573 my $protocolprop_hash_select_sql = scalar(@protocolprop_marker_hash_select_arr) > 0 ? ', '.join ',', @protocolprop_marker_hash_select_arr : '';
575 my $chromosome_where = '';
576 if ($chromosome_list && scalar(@$chromosome_list)>0) {
577 my $chromosome_list_sql = '\'' . join('\', \'', @$chromosome_list) . '\'';
578 $chromosome_where = " AND (s.value->>'chrom')::text IN ($chromosome_list_sql)";
579 #$genotypeprop_chromosome_rank_string = " AND value->>'CHROM' IN ($chromosome_list_sql) ";
581 my $start_position_where = '';
582 if (defined($start_position)) {
583 $start_position_where = " AND (s.value->>'pos')::int >= $start_position";
585 my $end_position_where = '';
586 if (defined($end_position)) {
587 $end_position_where = " AND (s.value->>'pos')::int <= $end_position";
590 my $protocolprop_q = "SELECT nd_protocol_id, s.key $protocolprop_hash_select_sql
591 FROM nd_protocolprop, jsonb_each(nd_protocolprop.value) as s
592 WHERE $protocolprop_where_markers_sql $chromosome_where $start_position_where $end_position_where;";
594 my $protocolprop_h = $schema->storage->dbh()->prepare($protocolprop_q);
595 $protocolprop_h->execute();
596 while (my ($protocol_id, $marker_name, @protocolprop_info_return) = $protocolprop_h->fetchrow_array()) {
597 for my $s (0 .. scalar(@protocolprop_marker_hash_select_arr)-1){
598 $selected_protocol_marker_info{$protocol_id}->{$marker_name}->{$protocolprop_marker_hash_select->[$s]} = $protocolprop_info_return[$s];
600 $filtered_markers{$marker_name}++;
602 my $protocolprop_top_key_select_sql = scalar(@protocolprop_top_key_select_arr) > 0 ? ', '.join ',', @protocolprop_top_key_select_arr : '';
603 my $protocolprop_top_key_q = "SELECT nd_protocol_id $protocolprop_top_key_select_sql from nd_protocolprop WHERE $protocolprop_where_sql;";
604 my $protocolprop_top_key_h = $schema->storage->dbh()->prepare($protocolprop_top_key_q);
605 $protocolprop_top_key_h->execute();
606 while (my ($protocol_id, @protocolprop_top_key_return) = $protocolprop_top_key_h->fetchrow_array()) {
607 for my $s (0 .. scalar(@protocolprop_top_key_select_arr)-1){
608 my $protocolprop_i = $protocolprop_top_key_select->[$s];
609 my $val;
610 if ($protocolprop_i eq 'header_information_lines' || $protocolprop_i eq 'marker_names') {
611 $val = decode_json $protocolprop_top_key_return[$s];
612 } else {
613 $val = $protocolprop_top_key_return[$s];
615 $selected_protocol_top_key_info{$protocol_id}->{$protocolprop_i} = $val;
618 if (exists($protocolprop_top_key_select_hash{'markers'})) {
619 my $protocolprop_top_key_q = "SELECT nd_protocol_id, value from nd_protocolprop WHERE $protocolprop_where_markers_sql;";
620 my $protocolprop_top_key_h = $schema->storage->dbh()->prepare($protocolprop_top_key_q);
621 $protocolprop_top_key_h->execute();
622 while (my ($protocol_id, $markers_value) = $protocolprop_top_key_h->fetchrow_array()) {
623 $selected_protocol_top_key_info{$protocol_id}->{'markers'} = decode_json $markers_value;
626 if (exists($protocolprop_top_key_select_hash{'markers_array'})) {
627 my $protocolprop_top_key_q = "SELECT nd_protocol_id, value from nd_protocolprop WHERE $protocolprop_where_markers_array_sql;";
628 my $protocolprop_top_key_h = $schema->storage->dbh()->prepare($protocolprop_top_key_q);
629 $protocolprop_top_key_h->execute();
630 while (my ($protocol_id, $markers_value) = $protocolprop_top_key_h->fetchrow_array()) {
631 $selected_protocol_top_key_info{$protocol_id}->{'markers_array'} = decode_json $markers_value;
636 my @genotypeprop_hash_select_arr;
637 foreach (@$genotypeprop_hash_select){
638 push @genotypeprop_hash_select_arr, "s.value->>'$_'";
640 if (scalar(@genotype_id_array)>0) {
641 my $genotypeprop_id_sql = join ("," , @genotype_id_array);
642 my $genotypeprop_hash_select_sql = scalar(@genotypeprop_hash_select_arr) > 0 ? ', '.join ',', @genotypeprop_hash_select_arr : '';
644 my $filtered_markers_sql = '';
645 if (scalar(keys %filtered_markers) >0 && scalar(keys %filtered_markers) < 10000) {
646 $filtered_markers_sql = " AND s.key IN ('". join ("','", keys %filtered_markers) ."')";
649 my $q2 = "SELECT genotypeprop_id
650 FROM genotypeprop WHERE genotype_id = ? AND type_id=$vcf_genotyping_cvterm_id $genotypeprop_chromosome_rank_string;";
651 my $h2 = $schema->storage->dbh()->prepare($q2);
653 my $genotypeprop_q = "SELECT s.key $genotypeprop_hash_select_sql
654 FROM genotypeprop, jsonb_each(genotypeprop.value) as s
655 WHERE genotypeprop_id = ? AND s.key != 'CHROM' AND type_id = $vcf_genotyping_cvterm_id $filtered_markers_sql;";
656 my $genotypeprop_h = $schema->storage->dbh()->prepare($genotypeprop_q);
658 foreach my $genotype_id (@genotype_id_array){
659 $h2->execute($genotype_id);
660 while (my ($genotypeprop_id) = $h2->fetchrow_array()) {
661 $genotypeprop_h->execute($genotypeprop_id);
662 while (my ($marker_name, @genotypeprop_info_return) = $genotypeprop_h->fetchrow_array()) {
663 for my $s (0 .. scalar(@genotypeprop_hash_select_arr)-1){
664 $genotype_hash{$genotype_id}->{selected_genotype_hash}->{$marker_name}->{$genotypeprop_hash_select->[$s]} = $genotypeprop_info_return[$s];
671 foreach (@genotype_id_array) {
672 my $info = $genotype_hash{$_};
673 my $selected_marker_info = $selected_protocol_marker_info{$info->{analysisMethodDbId}} ? $selected_protocol_marker_info{$info->{analysisMethodDbId}} : {};
674 my $selected_protocol_info = $selected_protocol_top_key_info{$info->{analysisMethodDbId}} ? $selected_protocol_top_key_info{$info->{analysisMethodDbId}} : {};
675 my @all_protocol_marker_names = keys %$selected_marker_info;
676 $selected_protocol_info->{markers} = $selected_marker_info;
677 $info->{resultCount} = scalar(keys %{$info->{selected_genotype_hash}});
678 $info->{all_protocol_marker_names} = \@all_protocol_marker_names;
679 $info->{selected_protocol_hash} = $selected_protocol_info;
680 push @data, $info;
683 return ($total_count, \@data);
686 =head2 init_genotype_iterator()
688 Function for initiating genotype search query and then used to get genotype data iteratively. Iterative search retrieval minimizes memory usage.
689 Should be used like:
691 my $genotype_search = CXGN::Genotype::Search->new({
692 bcs_schema=>$schema,
693 etc...
695 $genotype_search->init_genotype_iterator();
696 while (my ($count, $genotype_data) = $genotype_search->get_next_genotype_info) {
697 #Do something with genotype data
700 If you just want to get a file with the genotype result in a dosage matrix or VCF file, use get_cached_file_dosage_matrix or get_cached_file_VCF functions instead.
701 If you want results in json format use get_cached_file_search_json
702 If you want results in json format for only the metadata (no genotype call data), use get_cached_file_search_json()
704 =cut
706 sub init_genotype_iterator {
707 my $self = shift;
708 my $schema = $self->bcs_schema;
709 my $trial_list = $self->trial_list;
710 my $genotype_data_project_list = $self->genotype_data_project_list;
711 my $protocol_id_list = $self->protocol_id_list;
712 my $markerprofile_id_list = $self->markerprofile_id_list;
713 my $accession_list = $self->accession_list;
714 my $tissue_sample_list = $self->tissue_sample_list;
715 my $marker_name_list = $self->marker_name_list;
716 my $chromosome_list = $self->chromosome_list;
717 my $start_position = $self->start_position;
718 my $end_position = $self->end_position;
719 my $genotypeprop_hash_select = $self->genotypeprop_hash_select;
720 my $protocolprop_top_key_select = $self->protocolprop_top_key_select;
721 my $protocolprop_marker_hash_select = $self->protocolprop_marker_hash_select;
722 my $marker_search_hash_list = $self->marker_search_hash_list;
723 my $marker_score_search_hash_list = $self->marker_score_search_hash_list;
724 my $return_only_first_genotypeprop_for_stock = $self->return_only_first_genotypeprop_for_stock;
725 my $limit = $self->limit;
726 my $offset = $self->offset;
727 my @data;
728 my %search_params;
729 my @where_clause;
731 my $vcf_genotyping_cvterm_id = $self->search_vcf_genotyping_cvterm_id({protocol_id => $protocol_id_list->[0],genotype_id => $markerprofile_id_list->[0]});
732 $self->_vcf_genotyping_cvterm_id($vcf_genotyping_cvterm_id);
734 my $vcf_map_details_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details', 'protocol_property')->cvterm_id();
735 $self->_vcf_map_details_cvterm_id($vcf_map_details_cvterm_id);
736 my $vcf_map_details_markers_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details_markers', 'protocol_property')->cvterm_id();
737 $self->_vcf_map_details_markers_cvterm_id($vcf_map_details_markers_cvterm_id);
738 my $vcf_map_details_markers_array_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details_markers_array', 'protocol_property')->cvterm_id();
739 $self->_vcf_map_details_markers_array_cvterm_id($vcf_map_details_markers_array_cvterm_id);
740 my $igd_genotypeprop_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'igd number', 'genotype_property')->cvterm_id();
741 $self->_igd_genotypeprop_cvterm_id($igd_genotypeprop_cvterm_id);
742 my $accession_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'accession', 'stock_type')->cvterm_id();
743 $self->_accession_cvterm_id($accession_cvterm_id);
744 my $tissue_sample_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'tissue_sample', 'stock_type')->cvterm_id();
745 $self->_tissue_sample_cvterm_id($tissue_sample_cvterm_id);
746 my $plot_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'plot', 'stock_type')->cvterm_id();
747 $self->_plot_cvterm_id($plot_cvterm_id);
748 my $plant_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'plant', 'stock_type')->cvterm_id();
749 $self->_plant_cvterm_id($plant_cvterm_id);
750 my $tissue_sample_of_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'tissue_sample_of', 'stock_relationship')->cvterm_id();
751 $self->_tissue_sample_of_cvterm_id($tissue_sample_of_cvterm_id);
753 my @trials_accessions;
754 foreach (@$trial_list){
755 my $trial = CXGN::Trial->new({bcs_schema=>$schema, trial_id=>$_});
756 my $accessions = $trial->get_accessions();
757 foreach (@$accessions){
758 push @trials_accessions, $_->{stock_id};
762 #If accessions are explicitly given, then accessions found from trials will not be added to the search.
763 if (!$accession_list || scalar(@$accession_list)==0) {
764 push @$accession_list, @trials_accessions;
767 #For projects inserted into database during the addition of genotypes and genotypeprops
768 if (scalar(@trials_accessions)==0){
769 if ($trial_list && scalar(@$trial_list)>0) {
770 my $trial_sql = join ("," , @$trial_list);
771 push @where_clause, "project.project_id in ($trial_sql)";
775 #For genotyping_data_project
776 if ($genotype_data_project_list && scalar(@$genotype_data_project_list)>0) {
777 my $sql = join ("," , @$genotype_data_project_list);
778 push @where_clause, "project.project_id in ($sql)";
780 if ($protocol_id_list && scalar(@$protocol_id_list)>0) {
781 my $protocol_sql = join ("," , @$protocol_id_list);
782 push @where_clause, "nd_protocol.nd_protocol_id in ($protocol_sql)";
784 if ($accession_list && scalar(@$accession_list)>0) {
785 my $accession_sql = join ("," , @$accession_list);
786 push @where_clause, " ( stock.stock_id in ($accession_sql) OR (accession_of_tissue_sample.stock_id in ($accession_sql) AND accession_of_tissue_sample.type_id = $accession_cvterm_id) ) ";
787 push @where_clause, "stock.type_id in ($accession_cvterm_id, $tissue_sample_cvterm_id)";
789 if ($tissue_sample_list && scalar(@$tissue_sample_list)>0) {
790 my $stock_sql = join ("," , @$tissue_sample_list);
791 push @where_clause, "stock.stock_id in ($stock_sql)";
792 push @where_clause, "stock.type_id = $tissue_sample_cvterm_id";
794 if ($markerprofile_id_list && scalar(@$markerprofile_id_list)>0) {
795 my $markerprofile_sql = join ("," , @$markerprofile_id_list);
796 push @where_clause, "genotype.genotype_id in ($markerprofile_sql)";
798 my %filtered_markers;
799 if ($marker_name_list && scalar(@$marker_name_list)>0) {
800 my $search_vals_sql = "'".join ("','" , @$marker_name_list)."'";
801 push @where_clause, "nd_protocolprop.value->'marker_names' \\?& array[$search_vals_sql]";
803 foreach (@$marker_name_list) {
804 $filtered_markers{$_}++;
807 if ($marker_search_hash_list && scalar(@$marker_search_hash_list)>0) {
808 foreach (@$marker_search_hash_list){
809 my $json_val = encode_json $_;
810 push @where_clause, "nd_protocolprop.value->'markers' \\@> $json_val"."::jsonb";
813 if ($marker_score_search_hash_list && scalar(@$marker_score_search_hash_list)>0) {
814 foreach (@$marker_score_search_hash_list){
815 my $json_val = encode_json $_;
816 push @where_clause, "genotype_values.value \\@> $json_val"."::jsonb";
820 my $where_clause = scalar(@where_clause)>0 ? " WHERE " . (join (" AND " , @where_clause)) : '';
822 my $offset_clause = '';
823 my $limit_clause = '';
824 if ($limit){
825 $limit_clause = " LIMIT $limit ";
827 if ($offset){
828 $offset_clause = " OFFSET $offset ";
831 my $stock_select = '';
832 if ($return_only_first_genotypeprop_for_stock) {
833 $stock_select = 'distinct on (stock.stock_id) stock.stock_id';
834 } else {
835 $stock_select = 'stock.stock_id';
838 # Setup protocolprop query handles
839 my @protocolprop_marker_hash_select_arr;
840 foreach (@$protocolprop_marker_hash_select){
841 push @protocolprop_marker_hash_select_arr, "s.value->>'$_'";
843 $self->_protocolprop_marker_hash_select_arr(\@protocolprop_marker_hash_select_arr);
845 my @protocolprop_top_key_select_arr;
846 foreach (@$protocolprop_top_key_select){
847 if ($_ ne 'markers' && $_ ne 'markers_array') {
848 push @protocolprop_top_key_select_arr, "value->>'$_'";
851 $self->_protocolprop_top_key_select_arr(\@protocolprop_top_key_select_arr);
853 my $protocolprop_hash_select_sql = scalar(@protocolprop_marker_hash_select_arr) > 0 ? ', '.join ',', @protocolprop_marker_hash_select_arr : '';
855 my $chromosome_where = '';
856 my $genotypeprop_chromosome_rank_string = '';
857 if ($chromosome_list && scalar(@$chromosome_list)>0) {
858 my $chromosome_list_sql = '\'' . join('\', \'', @$chromosome_list) . '\'';
859 $chromosome_where = " AND (s.value->>'chrom')::text IN ($chromosome_list_sql)";
860 #$genotypeprop_chromosome_rank_string = " AND value->>'CHROM' IN ($chromosome_list_sql) ";
862 my $start_position_where = '';
863 if (defined($start_position)) {
864 $start_position_where = " AND (s.value->>'pos')::int >= $start_position";
866 my $end_position_where = '';
867 if (defined($end_position)) {
868 $end_position_where = " AND (s.value->>'pos')::int <= $end_position";
870 my $marker_name_list_where = '';
871 if ($marker_name_list && scalar(@$marker_name_list)>0) {
872 my $search_vals_sql = '\''.join ('\', \'' , @$marker_name_list).'\'';
873 $marker_name_list_where = "AND (s.value->>'name')::text IN ($search_vals_sql)";
876 my $protocolprop_q = "SELECT nd_protocol_id, s.key $protocolprop_hash_select_sql
877 FROM nd_protocolprop, jsonb_each(nd_protocolprop.value) as s
878 WHERE nd_protocol_id = ? AND type_id = $vcf_map_details_markers_cvterm_id $chromosome_where $start_position_where $end_position_where $marker_name_list_where;";
879 #print STDERR Dumper $protocolprop_q;
880 my $protocolprop_h = $schema->storage->dbh()->prepare($protocolprop_q);
881 $self->_protocolprop_markers_h($protocolprop_h);
883 my $protocolprop_top_key_select_sql = scalar(@protocolprop_top_key_select_arr) > 0 ? ', '.join ',', @protocolprop_top_key_select_arr : '';
884 my $protocolprop_top_key_q = "SELECT nd_protocol_id $protocolprop_top_key_select_sql from nd_protocolprop WHERE nd_protocol_id = ? AND type_id = $vcf_map_details_cvterm_id;";
885 my $protocolprop_top_key_h = $schema->storage->dbh()->prepare($protocolprop_top_key_q);
886 $self->_protocolprop_top_key_h($protocolprop_top_key_h);
888 my $protocolprop_top_key_markers_q = "SELECT nd_protocol_id, value from nd_protocolprop WHERE nd_protocol_id = ? AND type_id = $vcf_map_details_markers_cvterm_id;";
889 my $protocolprop_top_key_markers_h = $schema->storage->dbh()->prepare($protocolprop_top_key_markers_q);
890 $self->_protocolprop_top_key_markers_h($protocolprop_top_key_markers_h);
892 my $protocolprop_top_key_markers_array_q = "SELECT nd_protocol_id, value from nd_protocolprop WHERE nd_protocol_id = ? AND type_id = $vcf_map_details_markers_array_cvterm_id;";
893 my $protocolprop_top_key_markers_array_h = $schema->storage->dbh()->prepare($protocolprop_top_key_markers_array_q);
894 $self->_protocolprop_top_key_markers_array_h($protocolprop_top_key_markers_array_h);
896 my $q = "SELECT $stock_select, igd_number_genotypeprop.value, nd_protocol.nd_protocol_id, nd_protocol.name, stock.uniquename, stock.type_id, stock_cvterm.name, genotype.genotype_id, genotype.uniquename, genotype.description, project.project_id, project.name, project.description, accession_of_tissue_sample.stock_id, accession_of_tissue_sample.uniquename, count(genotype.genotype_id) OVER() AS full_count
897 FROM stock
898 JOIN cvterm AS stock_cvterm ON(stock.type_id = stock_cvterm.cvterm_id)
899 LEFT JOIN stock_relationship ON(stock_relationship.subject_id=stock.stock_id AND stock_relationship.type_id = $tissue_sample_of_cvterm_id)
900 LEFT JOIN stock AS accession_of_tissue_sample ON(stock_relationship.object_id=accession_of_tissue_sample.stock_id)
901 JOIN nd_experiment_stock ON(stock.stock_id=nd_experiment_stock.stock_id)
902 JOIN nd_experiment USING(nd_experiment_id)
903 JOIN nd_experiment_protocol USING(nd_experiment_id)
904 JOIN nd_experiment_project USING(nd_experiment_id)
905 JOIN nd_experiment_genotype USING(nd_experiment_id)
906 JOIN nd_protocol USING(nd_protocol_id)
907 LEFT JOIN nd_protocolprop ON(nd_protocolprop.nd_protocol_id = nd_protocol.nd_protocol_id AND nd_protocolprop.type_id = $vcf_map_details_cvterm_id)
908 JOIN genotype USING(genotype_id)
909 LEFT JOIN genotypeprop AS igd_number_genotypeprop ON(igd_number_genotypeprop.genotype_id = genotype.genotype_id AND igd_number_genotypeprop.type_id = $igd_genotypeprop_cvterm_id)
910 JOIN project USING(project_id)
911 $where_clause
912 ORDER BY stock.stock_id, genotype.genotype_id ASC, accession_of_tissue_sample.stock_id DESC
913 $limit_clause
914 $offset_clause;";
916 #print STDERR Dumper $q;
917 my $h = $schema->storage->dbh()->prepare($q);
918 $h->execute();
919 my @genotypeprop_infos;
920 my %seen_protocol_ids;
921 while (my ($stock_id, $igd_number_json, $protocol_id, $protocol_name, $stock_name, $stock_type_id, $stock_type_name, $genotype_id, $genotype_uniquename, $genotype_description, $project_id, $project_name, $project_description, $accession_id, $accession_uniquename, $full_count) = $h->fetchrow_array()) {
923 my $germplasmName = '';
924 my $germplasmDbId = '';
926 my $igd_number_hash = $igd_number_json ? decode_json $igd_number_json : undef;
927 my $igd_number = $igd_number_hash ? $igd_number_hash->{'igd number'} : undef;
928 $igd_number = !$igd_number && $igd_number_hash ? $igd_number_hash->{'igd_number'} : undef;
930 if ($stock_type_name eq 'accession'){
931 $germplasmName = $stock_name;
932 $germplasmDbId = $stock_id;
934 if ($stock_type_name eq 'tissue_sample'){
935 $germplasmName = $accession_uniquename;
936 $germplasmDbId = $accession_id;
939 my $stock_object = CXGN::Stock::Accession->new({schema=>$self->bcs_schema, stock_id=>$germplasmDbId});
941 my %genotypeprop_info = (
942 markerProfileDbId => $genotype_id,
943 germplasmDbId => $germplasmDbId,
944 germplasmName => $germplasmName,
945 synonyms => $stock_object->synonyms,
946 stock_id => $stock_id,
947 stock_name => $stock_name,
948 stock_type_id => $stock_type_id,
949 stock_type_name => $stock_type_name,
950 genotypeDbId => $genotype_id,
951 genotypeUniquename => $genotype_uniquename,
952 genotypeDescription => $genotype_description,
953 analysisMethodDbId => $protocol_id,
954 analysisMethod => $protocol_name,
955 genotypingDataProjectDbId => $project_id,
956 genotypingDataProjectName => $project_name,
957 genotypingDataProjectDescription => $project_description,
958 igd_number => $igd_number,
959 full_count => $full_count
961 $seen_protocol_ids{$protocol_id}++;
962 push @genotypeprop_infos, \%genotypeprop_info;
965 $self->_genotypeprop_infos(\@genotypeprop_infos);
966 $self->_genotypeprop_infos_counter(0);
968 my @seen_protocol_ids = keys %seen_protocol_ids;
969 my %protocolprop_top_key_select_hash = map {$_ => 1} @protocolprop_top_key_select_arr;
970 my %selected_protocol_marker_info;
971 my %selected_protocol_top_key_info;
972 my $val;
974 foreach my $protocol_id (@seen_protocol_ids){
975 $protocolprop_h->execute($protocol_id);
976 while (my ($protocol_id, $marker_name, @protocolprop_info_return) = $protocolprop_h->fetchrow_array()) {
977 for my $s (0 .. scalar(@protocolprop_marker_hash_select_arr)-1){
978 $selected_protocol_marker_info{$protocol_id}->{$marker_name}->{$protocolprop_marker_hash_select->[$s]} = $protocolprop_info_return[$s];
980 $filtered_markers{$marker_name}++;
983 $protocolprop_top_key_h->execute($protocol_id);
984 while (my ($protocol_id, @protocolprop_top_key_return) = $protocolprop_top_key_h->fetchrow_array()) {
985 for my $s (0 .. scalar(@protocolprop_top_key_select_arr)-1){
986 my $protocolprop_i = $protocolprop_top_key_select->[$s];
987 if ($protocolprop_i eq 'header_information_lines' || $protocolprop_i eq 'marker_names') {
988 $val = decode_json $protocolprop_top_key_return[$s];
989 } else {
990 $val = $protocolprop_top_key_return[$s];
992 $selected_protocol_top_key_info{$protocol_id}->{$protocolprop_i} = $val;
995 if (exists($protocolprop_top_key_select_hash{'markers'})) {
996 $protocolprop_top_key_markers_h->execute($protocol_id);
997 while (my ($protocol_id, $markers_value) = $protocolprop_top_key_markers_h->fetchrow_array()) {
998 $selected_protocol_top_key_info{$protocol_id}->{'markers'} = decode_json $markers_value;
1001 if (exists($protocolprop_top_key_select_hash{'markers_array'})) {
1002 $protocolprop_top_key_markers_array_h->execute($protocol_id);
1003 while (my ($protocol_id, $markers_value) = $protocolprop_top_key_markers_array_h->fetchrow_array()) {
1004 $selected_protocol_top_key_info{$protocol_id}->{'markers_array'} = decode_json $markers_value;
1008 $self->_selected_protocol_marker_info(\%selected_protocol_marker_info);
1009 $self->_selected_protocol_top_key_info(\%selected_protocol_top_key_info);
1010 $self->_filtered_markers(\%filtered_markers);
1012 # Setup genotypeprop query handle
1013 my @genotypeprop_hash_select_arr;
1014 foreach (@$genotypeprop_hash_select){
1015 push @genotypeprop_hash_select_arr, "s.value->>'$_'";
1017 $self->_genotypeprop_hash_select_arr(\@genotypeprop_hash_select_arr);
1018 my $genotypeprop_hash_select_sql = scalar(@genotypeprop_hash_select_arr) > 0 ? ', '.join ',', @genotypeprop_hash_select_arr : '';
1020 my $filtered_markers_sql = '';
1021 # If filtered markers by providing a location range or chromosome these markers will be in %filered_markers, but we dont want to use this SQL if there are too many markers (>10000) )
1022 if (scalar(keys %filtered_markers) >0 && scalar(keys %filtered_markers) < 10000) {
1023 $filtered_markers_sql = " AND s.key IN ('". join ("','", keys %filtered_markers) ."')";
1026 my $genotypeprop_q = "SELECT s.key $genotypeprop_hash_select_sql
1027 FROM genotypeprop, jsonb_each(genotypeprop.value) as s
1028 WHERE genotypeprop_id = ? AND s.key != 'CHROM' AND type_id =$vcf_genotyping_cvterm_id $filtered_markers_sql;";
1029 my $genotypeprop_h = $schema->storage->dbh()->prepare($genotypeprop_q);
1030 $self->_genotypeprop_h($genotypeprop_h);
1032 my $q2 = "SELECT genotypeprop_id
1033 FROM genotypeprop WHERE genotype_id = ? AND type_id=$vcf_genotyping_cvterm_id $genotypeprop_chromosome_rank_string;";
1034 my $h2 = $schema->storage->dbh()->prepare($q2);
1035 $self->_iterator_genotypeprop_query_handle($h2);
1037 return;
1040 =head2 get_next_genotype_info()
1042 Function for getting genotype data iteratively. Iterative search retrieval minimizes memory usage.
1043 Should be used like:
1045 my $genotype_search = CXGN::Genotype::Search->new({
1046 bcs_schema=>$schema,
1047 ...etc
1049 $genotype_search->init_genotype_iterator();
1050 while (my ($count, $genotype_data) = $genotype_search->get_next_genotype_info) {
1051 #Do something with genotype data
1054 If you just want to get a file with the genotype result in a dosage matrix or VCF file, use get_cached_file_dosage_matrix or get_cached_file_VCF instead.
1055 If you want results in json format use get_cached_file_search_json
1056 If you want results in json format for only the metadata (no genotype call data), use get_cached_file_search_json()
1058 =cut
1060 sub get_next_genotype_info {
1061 my $self = shift;
1062 my $schema = $self->bcs_schema;
1063 my $trial_list = $self->trial_list;
1064 my $genotype_data_project_list = $self->genotype_data_project_list;
1065 my $protocol_id_list = $self->protocol_id_list;
1066 my $markerprofile_id_list = $self->markerprofile_id_list;
1067 my $accession_list = $self->accession_list;
1068 my $tissue_sample_list = $self->tissue_sample_list;
1069 my $marker_name_list = $self->marker_name_list;
1070 my $chromosome_list = $self->chromosome_list;
1071 my $start_position = $self->start_position;
1072 my $end_position = $self->end_position;
1073 my $genotypeprop_hash_select = $self->genotypeprop_hash_select;
1074 my $protocolprop_top_key_select = $self->protocolprop_top_key_select;
1075 my $protocolprop_marker_hash_select = $self->protocolprop_marker_hash_select;
1076 my $marker_search_hash_list = $self->marker_search_hash_list;
1077 my $marker_score_search_hash_list = $self->marker_score_search_hash_list;
1078 my $return_only_first_genotypeprop_for_stock = $self->return_only_first_genotypeprop_for_stock;
1079 my $h = $self->_iterator_query_handle();
1080 my $h_genotypeprop = $self->_iterator_genotypeprop_query_handle();
1081 my $protocolprop_markers_h = $self->_protocolprop_markers_h();
1082 my $protocolprop_top_key_h = $self->_protocolprop_top_key_h();
1083 my $protocolprop_top_key_markers_h = $self->_protocolprop_top_key_markers_h();
1084 my $genotypeprop_h = $self->_genotypeprop_h();
1085 my $protocolprop_top_key_markers_array_h = $self->_protocolprop_top_key_markers_array_h();
1086 my $protocolprop_marker_hash_select_arr = $self->_protocolprop_marker_hash_select_arr();
1087 my $protocolprop_top_key_select_arr = $self->_protocolprop_top_key_select_arr();
1088 my $genotypeprop_hash_select_arr = $self->_genotypeprop_hash_select_arr();
1089 my $vcf_genotyping_cvterm_id = $self->_vcf_genotyping_cvterm_id();
1090 my $vcf_map_details_cvterm_id = $self->_vcf_map_details_cvterm_id();
1091 my $vcf_map_details_markers_cvterm_id = $self->_vcf_map_details_markers_cvterm_id();
1092 my $vcf_map_details_markers_array_cvterm_id = $self->_vcf_map_details_markers_array_cvterm_id();
1093 my $igd_genotypeprop_cvterm_id = $self->_igd_genotypeprop_cvterm_id();
1094 my $accession_cvterm_id = $self->_accession_cvterm_id();
1095 my $tissue_sample_cvterm_id = $self->_tissue_sample_cvterm_id();
1096 my $tissue_sample_of_cvterm_id = $self->_tissue_sample_of_cvterm_id();
1098 my $total_count = 0;
1099 my @genotypeprop_array;
1100 my %protocolprop_hash;
1102 my %selected_protocol_marker_info = %{$self->_selected_protocol_marker_info};
1103 my %selected_protocol_top_key_info = %{$self->_selected_protocol_top_key_info};
1104 my %filtered_markers = %{$self->_filtered_markers};
1105 my $genotypeprop_infos = $self->_genotypeprop_infos;
1106 my $genotypeprop_infos_counter = $self->_genotypeprop_infos_counter;
1108 if ($genotypeprop_infos->[$genotypeprop_infos_counter]) {
1109 my %genotypeprop_info = %{$genotypeprop_infos->[$genotypeprop_infos_counter]};
1110 my $genotype_id = $genotypeprop_info{markerProfileDbId};
1111 my $protocol_id = $genotypeprop_info{analysisMethodDbId};
1112 my $full_count = $genotypeprop_info{full_count};
1114 $h_genotypeprop->execute($genotype_id);
1115 while (my ($genotypeprop_id) = $h_genotypeprop->fetchrow_array) {
1116 $genotypeprop_h->execute($genotypeprop_id);
1117 while (my ($marker_name, @genotypeprop_info_return) = $genotypeprop_h->fetchrow_array()) {
1118 for my $s (0 .. scalar(@$genotypeprop_hash_select_arr)-1){
1119 $genotypeprop_info{selected_genotype_hash}->{$marker_name}->{$genotypeprop_hash_select->[$s]} = $genotypeprop_info_return[$s];
1124 my $selected_marker_info = $selected_protocol_marker_info{$protocol_id} ? $selected_protocol_marker_info{$protocol_id} : {};
1125 my $selected_protocol_info = $selected_protocol_top_key_info{$protocol_id} ? $selected_protocol_top_key_info{$protocol_id} : {};
1126 my @all_protocol_marker_names = keys %$selected_marker_info;
1127 $selected_protocol_info->{markers} = $selected_marker_info;
1128 $genotypeprop_info{resultCount} = scalar(keys %{$genotypeprop_info{selected_genotype_hash}});
1129 $genotypeprop_info{all_protocol_marker_names} = \@all_protocol_marker_names;
1130 $genotypeprop_info{selected_protocol_hash} = $selected_protocol_info;
1132 $self->_genotypeprop_infos_counter($self->_genotypeprop_infos_counter + 1);
1134 return ($full_count, \%genotypeprop_info);
1137 return;
1140 sub key {
1141 my $self = shift;
1142 my $datatype = shift;
1144 my $json = JSON->new();
1145 #preserve order of hash keys to get same text
1146 $json = $json->canonical();
1147 my $accessions = $json->encode( $self->accession_list() || [] );
1148 my $tissues = $json->encode( $self->tissue_sample_list() || [] );
1149 my $trials = $json->encode( $self->trial_list() || [] );
1150 my $protocols = $json->encode( $self->protocol_id_list() || [] );
1151 my $markerprofiles = $json->encode( $self->markerprofile_id_list() || [] );
1152 my $genotypedataprojects = $json->encode( $self->genotype_data_project_list() || [] );
1153 my $markernames = $json->encode( $self->marker_name_list() || [] );
1154 my $genotypeprophash = $json->encode( $self->genotypeprop_hash_select() || [] );
1155 my $protocolprophash = $json->encode( $self->protocolprop_top_key_select() || [] );
1156 my $protocolpropmarkerhash = $json->encode( $self->protocolprop_marker_hash_select() || [] );
1157 my $chromosomes = $json->encode( $self->chromosome_list() || [] );
1158 my $start = $self->start_position() || '' ;
1159 my $end = $self->end_position() || '';
1160 my $prevent_transpose = $self->prevent_transpose() || '';
1161 my $key = md5_hex($accessions.$tissues.$trials.$protocols.$markerprofiles.$genotypedataprojects.$markernames.$genotypeprophash.$protocolprophash.$protocolpropmarkerhash.$chromosomes.$start.$end.$self->return_only_first_genotypeprop_for_stock().$prevent_transpose.$self->limit().$self->offset()."_$datatype");
1162 #print STDERR Dumper($genotypeprophash);
1163 return $key;
1166 =head2 get_cached_file_search_json()
1168 Function for getting the file handle for the genotype search result from cache. Will write the cached file if it does not exist.
1169 Returns the genotype result in a line-by-line json format.
1170 Uses the file iterator to write the cached file, so that it uses little memory.
1172 First line in file has all marker objects, while subsequent lines have markerprofiles for each sample
1173 If you want results in json format for only the metadata (no genotype call data), pass 1 for metadata_only
1175 =cut
1177 sub get_cached_file_search_json {
1178 my $self = shift;
1179 my $shared_cluster_dir_config = shift;
1180 my $metadata_only = shift;
1181 my $protocol_ids = $self->protocol_id_list;
1183 my $metadata_only_string = $metadata_only ? "metadata_only" : "all_data";
1184 my $key = $self->key("get_cached_file_search_json_v03_".$metadata_only_string);
1185 $self->cache( Cache::File->new( cache_root => $self->cache_root() ));
1187 my $file_handle;
1188 if ($self->cache()->exists($key) && !$self->forbid_cache()) {
1189 $file_handle = $self->cache()->handle($key);
1190 } else {
1191 # Set the temp dir and temp output file
1192 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_json";
1193 print STDERR "creating cached json file in $tmp_output_dir\n";
1194 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
1195 my ($tmp_fh, $tempfile) = tempfile(
1196 "wizard_download_XXXXX",
1197 DIR=> $tmp_output_dir,
1200 my @all_marker_objects;
1201 foreach (@$protocol_ids) {
1202 my $protocol = CXGN::Genotype::Protocol->new({
1203 bcs_schema => $self->bcs_schema,
1204 nd_protocol_id => $_,
1205 chromosome_list=>$self->chromosome_list,
1206 start_position=>$self->start_position,
1207 end_position=>$self->end_position,
1208 marker_name_list=>$self->marker_name_list
1210 my $markers = $protocol->markers;
1211 push @all_marker_objects, values %$markers;
1214 foreach (@all_marker_objects) {
1215 $self->_filtered_markers()->{$_->{name}}++;
1218 $self->init_genotype_iterator();
1220 #VCF should be sorted by chromosome and position
1221 no warnings 'uninitialized';
1222 @all_marker_objects = sort { $a->{chrom} cmp $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
1223 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1225 my $counter = 0;
1226 while (my $geno = $self->get_next_genotype_info) {
1228 # OLD GENOTYPING PROTCOLS DID NOT HAVE ND_PROTOCOLPROP INFO...
1229 if (scalar(@all_marker_objects) == 0) {
1230 foreach my $o (sort genosort keys %{$geno->{selected_genotype_hash}}) {
1231 push @all_marker_objects, {name => $o};
1233 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1236 if ($metadata_only) {
1237 @all_marker_objects = [];
1238 delete $geno->{selected_genotype_hash};
1239 delete $geno->{selected_protocol_hash};
1240 delete $geno->{all_protocol_marker_names};
1243 my $genotype_string = encode_json $geno;
1244 $genotype_string .= "\n";
1245 if ($counter == 0) {
1246 my $marker_string = encode_json \@all_marker_objects;
1247 $marker_string .= "\n";
1248 write_file($tempfile, {append => 1}, $marker_string);
1250 write_file($tempfile, {append => 1}, $genotype_string);
1251 $counter++;
1253 close $tempfile;
1255 open my $out_copy, '<', $tempfile or die "Can't open output file: $!";
1257 $self->cache()->set($key, '');
1258 $file_handle = $self->cache()->handle($key);
1259 copy($out_copy, $file_handle);
1261 close $out_copy;
1262 $file_handle = $self->cache()->handle($key);
1264 return $file_handle;
1267 =head2 get_cached_file_dosage_matrix()
1269 Function for getting the file handle for the genotype search result from cache. Will write the cached file if it does not exist.
1270 Returns the genotype result as a dosage matrix format.
1271 Uses the file iterator to write the cached file, so that it uses little memory.
1273 =cut
1275 sub get_cached_file_dosage_matrix {
1276 my $self = shift;
1277 my $shared_cluster_dir_config = shift;
1278 my $backend_config = shift;
1279 my $cluster_host_config = shift;
1280 my $web_cluster_queue_config = shift;
1281 my $basepath_config = shift;
1282 my $protocol_ids = $self->protocol_id_list;
1284 my $key = $self->key("get_cached_file_dosage_matrix_v03");
1285 $self->cache( Cache::File->new( cache_root => $self->cache_root() ));
1287 my $file_handle;
1288 if ($self->cache()->exists($key) && !$self->forbid_cache()) {
1289 $file_handle = $self->cache()->handle($key);
1291 else {
1292 # Set the temp dir and temp output file
1293 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_dosage_matrix";
1294 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
1295 my ($tmp_fh, $tempfile) = tempfile(
1296 "wizard_download_XXXXX",
1297 DIR=> $tmp_output_dir,
1300 my @all_marker_objects;
1301 foreach (@$protocol_ids) {
1302 my $protocol = CXGN::Genotype::Protocol->new({
1303 bcs_schema => $self->bcs_schema,
1304 nd_protocol_id => $_,
1305 chromosome_list=>$self->chromosome_list,
1306 start_position=>$self->start_position,
1307 end_position=>$self->end_position,
1308 marker_name_list=>$self->marker_name_list
1310 my $markers = $protocol->markers;
1311 push @all_marker_objects, values %$markers;
1314 foreach (@all_marker_objects) {
1315 $self->_filtered_markers()->{$_->{name}}++;
1317 $self->init_genotype_iterator();
1319 #VCF should be sorted by chromosome and position
1320 no warnings 'uninitialized';
1321 @all_marker_objects = sort { $a->{chrom} cmp $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
1322 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1324 my $counter = 0;
1325 while (my $geno = $self->get_next_genotype_info) {
1327 # OLD GENOTYPING PROTCOLS DID NOT HAVE ND_PROTOCOLPROP INFO...
1328 if (scalar(@all_marker_objects) == 0) {
1329 foreach my $o (sort genosort keys %{$geno->{selected_genotype_hash}}) {
1330 push @all_marker_objects, {name => $o};
1332 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1335 my $genotype_string = "";
1336 if ($counter == 0) {
1337 $genotype_string .= "Marker\t";
1338 foreach my $m (@all_marker_objects) {
1339 $genotype_string .= $m->{name} . "\t";
1341 $genotype_string .= "\n";
1343 my $genotype_id = $geno->{germplasmName};
1344 if (!$self->return_only_first_genotypeprop_for_stock) {
1345 $genotype_id = $geno->{germplasmName}."|".$geno->{markerProfileDbId};
1347 my $genotype_data_string = "";
1348 foreach my $m (@all_marker_objects) {
1349 my $current_genotype = $geno->{selected_genotype_hash}->{$m->{name}}->{DS};
1350 $genotype_data_string .= $current_genotype."\t";
1353 $genotype_string .= $genotype_id."\t".$genotype_data_string."\n";
1355 write_file($tempfile, {append => 1}, $genotype_string);
1356 $counter++;
1359 my $transpose_tempfile = $tempfile . "_transpose";
1361 my $cmd = CXGN::Tools::Run->new(
1363 backend => $backend_config,
1364 submit_host => $cluster_host_config,
1365 temp_base => $tmp_output_dir,
1366 queue => $web_cluster_queue_config,
1367 do_cleanup => 0,
1368 out_file => $transpose_tempfile,
1369 # out_file => $transpose_tempfile,
1370 # don't block and wait if the cluster looks full
1371 max_cluster_jobs => 1_000_000_000,
1375 my $out_copy;
1376 if ($self->prevent_transpose()) {
1377 open $out_copy, '<', $tempfile or die "Can't open output file: $!";
1379 else {
1380 # Do the transposition job on the cluster
1381 $cmd->run_cluster(
1382 "perl ",
1383 $basepath_config."/bin/transpose_matrix.pl",
1384 $tempfile,
1386 $cmd->is_cluster(1);
1387 $cmd->wait;
1389 open $out_copy, '<', $transpose_tempfile or die "Can't open output file: $!";
1392 $self->cache()->set($key, '');
1393 $file_handle = $self->cache()->handle($key);
1394 copy($out_copy, $file_handle);
1396 close $out_copy;
1397 $file_handle = $self->cache()->handle($key);
1399 return $file_handle;
1402 =head2 get_cached_file_dosage_matrix_compute_from_parents()
1403 Computes the genotypes for the queried accessions computed from the parental dosages. Parents are known from pedigrees of accessions.
1404 Function for getting the file handle for the genotype search result from cache. Will write the cached file if it does not exist.
1405 Returns the genotype result as a dosage matrix format.
1406 Uses the file iterator to write the cached file, so that it uses little memory.
1407 =cut
1409 sub get_cached_file_dosage_matrix_compute_from_parents {
1410 my $self = shift;
1411 my $shared_cluster_dir_config = shift;
1412 my $backend_config = shift;
1413 my $cluster_host_config = shift;
1414 my $web_cluster_queue_config = shift;
1415 my $basepath_config = shift;
1416 my $schema = $self->bcs_schema;
1417 my $protocol_ids = $self->protocol_id_list;
1418 my $marker_name_list = $self->marker_name_list;
1419 my $accession_ids = $self->accession_list;
1420 my $cache_root_dir = $self->cache_root();
1422 if (scalar(@$protocol_ids)>1) {
1423 die "Only one protocol at a time can be done when computing genotypes from parents\n";
1425 my $protocol_id = $protocol_ids->[0];
1427 my $key = $self->key("get_cached_file_dosage_matrix_compute_from_parents_v03");
1428 $self->cache( Cache::File->new( cache_root => $cache_root_dir ));
1430 my $file_handle;
1431 if ($self->cache()->exists($key) && !$self->forbid_cache()) {
1432 $file_handle = $self->cache()->handle($key);
1434 else {
1435 # Set the temp dir and temp output file
1436 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_dosage_matrix_compute_from_parents";
1437 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
1438 my ($tmp_fh, $tempfile) = tempfile(
1439 "wizard_download_XXXXX",
1440 DIR=> $tmp_output_dir,
1443 my $accession_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
1444 my $female_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'female_parent', 'stock_relationship')->cvterm_id();
1445 my $male_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'male_parent', 'stock_relationship')->cvterm_id();
1447 my $accession_list_string = join ',', @$accession_ids;
1448 my $q = "SELECT accession.stock_id, female_parent.stock_id, male_parent.stock_id
1449 FROM stock AS accession
1450 JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
1451 JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
1452 JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
1453 JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
1454 WHERE accession.stock_id IN ($accession_list_string) AND accession.type_id=$accession_cvterm_id ORDER BY accession.stock_id;";
1455 my $h = $schema->storage->dbh()->prepare($q);
1456 $h->execute();
1457 my @accession_stock_ids_found = ();
1458 my @female_stock_ids_found = ();
1459 my @male_stock_ids_found = ();
1460 while (my ($accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
1461 push @accession_stock_ids_found, $accession_stock_id;
1462 push @female_stock_ids_found, $female_parent_stock_id;
1463 push @male_stock_ids_found, $male_parent_stock_id;
1466 # print STDERR Dumper \@accession_stock_ids_found;
1467 # print STDERR Dumper \@female_stock_ids_found;
1468 # print STDERR Dumper \@male_stock_ids_found;
1470 my %unique_germplasm;
1471 my $protocol = CXGN::Genotype::Protocol->new({
1472 bcs_schema => $schema,
1473 nd_protocol_id => $protocol_id,
1474 chromosome_list=>$self->chromosome_list,
1475 start_position=>$self->start_position,
1476 end_position=>$self->end_position,
1477 marker_name_list=>$self->marker_name_list
1479 my $markers = $protocol->markers;
1480 my @all_marker_objects = values %$markers;
1482 foreach (@all_marker_objects) {
1483 $self->_filtered_markers()->{$_->{name}}++;
1486 no warnings 'uninitialized';
1487 @all_marker_objects = sort { $a->{chrom} cmp $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
1488 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1490 my $counter = 0;
1491 for my $i (0..scalar(@accession_stock_ids_found)-1) {
1492 my $female_stock_id = $female_stock_ids_found[$i];
1493 my $male_stock_id = $male_stock_ids_found[$i];
1494 my $accession_stock_id = $accession_stock_ids_found[$i];
1496 my $dataset = CXGN::Dataset::Cache->new({
1497 people_schema=>$self->people_schema,
1498 schema=>$schema,
1499 cache_root=>$cache_root_dir,
1500 accessions=>[$female_stock_id, $male_stock_id]
1502 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name'], 1, $self->chromosome_list, $self->start_position, $self->end_position, $self->marker_name_list);
1504 # For old protocols with no protocolprop info...
1505 if (scalar(@all_marker_objects) == 0) {
1506 foreach my $o (sort genosort keys %{$genotypes->[0]->{selected_genotype_hash}}) {
1507 push @all_marker_objects, {name => $o};
1509 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1512 my $genotype_string = "";
1513 if ($counter == 0) {
1514 $genotype_string .= "Marker\t";
1515 foreach my $m (@all_marker_objects) {
1516 $genotype_string .= $m->{name} . "\t";
1518 $genotype_string .= "\n";
1521 my $geno = CXGN::Genotype::ComputeHybridGenotype->new({
1522 parental_genotypes=>$genotypes,
1523 marker_objects=>\@all_marker_objects
1525 my $progeny_genotype = $geno->get_hybrid_genotype();
1526 my $genotype_string_scores = join "\t", @$progeny_genotype;
1528 my $genotype_id = $accession_stock_id;
1529 if (!$self->return_only_first_genotypeprop_for_stock) {
1530 $genotype_id = $accession_stock_id."|".$geno->{markerProfileDbId};
1533 $genotype_string .= $genotype_id."\t".$genotype_string_scores."\n";
1534 write_file($tempfile, {append => 1}, $genotype_string);
1535 $counter++;
1538 my $transpose_tempfile = $tempfile . "_transpose";
1540 my $cmd = CXGN::Tools::Run->new(
1542 backend => $backend_config,
1543 submit_host => $cluster_host_config,
1544 temp_base => $tmp_output_dir,
1545 queue => $web_cluster_queue_config,
1546 do_cleanup => 0,
1547 out_file => $transpose_tempfile,
1548 # out_file => $transpose_tempfile,
1549 # don't block and wait if the cluster looks full
1550 max_cluster_jobs => 1_000_000_000,
1554 my $out_copy;
1555 if ($self->prevent_transpose()) {
1556 open $out_copy, '<', $tempfile or die "Can't open output file: $!";
1558 else {
1559 # Do the transposition job on the cluster
1560 $cmd->run_cluster(
1561 "perl ",
1562 $basepath_config."/bin/transpose_matrix.pl",
1563 $tempfile,
1565 $cmd->is_cluster(1);
1566 $cmd->wait;
1568 open $out_copy, '<', $transpose_tempfile or die "Can't open output file: $!";
1571 $self->cache()->set($key, '');
1572 $file_handle = $self->cache()->handle($key);
1573 copy($out_copy, $file_handle);
1575 close $out_copy;
1576 $file_handle = $self->cache()->handle($key);
1578 return $file_handle;
1581 sub get_cached_file_VCF {
1582 my $self = shift;
1583 my $shared_cluster_dir_config = shift;
1584 my $backend_config = shift;
1585 my $cluster_host_config = shift;
1586 my $web_cluster_queue_config = shift;
1587 my $basepath_config = shift;
1588 my $forbid_cache = $self->forbid_cache();
1590 my $key = $self->key("get_cached_file_VCF_v04");
1591 $self->cache( Cache::File->new( cache_root => $self->cache_root() ));
1592 my $protocol_ids = $self->protocol_id_list;
1594 # print STDERR "\nget_cached_file_VCF: protocol_ids: @$protocol_ids\n";
1595 my $file_handle;
1596 if ($self->cache()->exists($key) && !$self->forbid_cache()) {
1597 $file_handle = $self->cache()->handle($key);
1598 print STDERR "\nget_cached_file_VCF: go file handle $file_handle\n";
1600 } else {
1601 # Set the temp dir and temp output file
1602 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_VCF";
1603 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
1604 my ($tmp_fh, $tempfile) = tempfile(
1605 "wizard_download_XXXXX",
1606 DIR=> $tmp_output_dir,
1609 my $time = DateTime->now();
1610 my $timestamp = $time->ymd()."_".$time->hms();
1612 my @all_protocol_info_lines;
1614 #Get all marker information for the protocol(s) requested. this is important if they are requesting subsets of markers or if they are querying more than one protocol at once. Also important for ordering VCF output. Old genotypes did not have protocolprop marker info so markers are taken from first genotypeprop return below.
1615 my @all_marker_objects;
1616 my %unique_germplasm;
1617 my @protocol_names;
1619 foreach (@$protocol_ids) {
1620 my $protocol = CXGN::Genotype::Protocol->new({
1621 bcs_schema => $self->bcs_schema,
1622 nd_protocol_id => $_,
1623 chromosome_list=>$self->chromosome_list,
1624 start_position=>$self->start_position,
1625 end_position=>$self->end_position,
1626 marker_name_list=>$self->marker_name_list
1628 my $markers = $protocol->markers;
1629 push @all_protocol_info_lines, @{$protocol->header_information_lines};
1630 push @all_marker_objects, values %$markers;
1631 push @protocol_names, $protocol->protocol_name();
1634 push @all_protocol_info_lines, "##source=FILE GENERATED BY BREEDBASE";
1635 push @all_protocol_info_lines, "##fileDate=$timestamp";
1636 push @all_protocol_info_lines, "##Genotyping protocol id(s)=@$protocol_ids";
1637 push @all_protocol_info_lines, "##Genotyping protocol name(s)=@protocol_names";
1639 foreach (@all_marker_objects) {
1640 $self->_filtered_markers()->{$_->{name}}++;
1643 $self->init_genotype_iterator();
1645 #VCF should be sorted by chromosome and position
1646 no warnings 'uninitialized';
1647 @all_marker_objects = sort { $a->{chrom} cmp $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
1648 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1650 my $counter = 0;
1651 my $usingGT;
1652 while (my $geno = $self->get_next_genotype_info) {
1653 # OLD GENOTYPING PROTCOLS DID NOT HAVE ND_PROTOCOLPROP INFO...
1654 if (scalar(@all_marker_objects) == 0) {
1655 foreach my $o (sort genosort keys %{$geno->{selected_genotype_hash}}) {
1656 push @all_marker_objects, {name => $o};
1658 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1661 $unique_germplasm{$geno->{germplasmDbId}}++;
1663 my $genotype_string = "";
1664 if ($counter == 0) {
1665 $genotype_string .= "#CHROM\t";
1666 foreach my $m (@all_marker_objects) {
1667 my $chrom = $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom};
1669 if (! $chrom) {
1670 ($chrom) = split /\_/, $m->{name};
1671 #print STDERR "Warning! No chrom data, using $chrom extracted from $m->{name}\n";
1673 #$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom} . "\t";
1674 $genotype_string .= $chrom ."\t";
1676 $genotype_string .= "\n";
1677 $genotype_string .= "POS\t";
1678 foreach my $m (@all_marker_objects) {
1679 my $pos = $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos};
1680 if ($pos eq "") {
1681 (undef, $pos) = split /\_/, $m->{name};
1682 if (! $pos) {
1683 $pos = 0;
1684 print STDERR "Warning! No position data, using 0\n";
1686 } elsif ($pos eq ".") { # pos must be an integer
1687 $pos = 0;
1689 #$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos} . "\t";
1690 $genotype_string .= $pos ."\t";
1692 $genotype_string .= "\n";
1693 $genotype_string .= "ID\t";
1694 foreach my $m (@all_marker_objects) {
1695 $genotype_string .= $m->{name} . "\t";
1697 $genotype_string .= "\n";
1698 $genotype_string .= "REF\t";
1699 foreach my $m (@all_marker_objects) {
1700 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{ref} . "\t";
1702 $genotype_string .= "\n";
1703 $genotype_string .= "ALT\t";
1704 foreach my $m (@all_marker_objects) {
1705 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{alt} . "\t";
1707 $genotype_string .= "\n";
1708 $genotype_string .= "QUAL\t";
1709 foreach my $m (@all_marker_objects) {
1710 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{qual} . "\t";
1712 $genotype_string .= "\n";
1713 $genotype_string .= "FILTER\t";
1714 foreach my $m (@all_marker_objects) {
1715 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{filter} . "\t";
1717 $genotype_string .= "\n";
1718 $genotype_string .= "INFO\t";
1719 foreach my $m (@all_marker_objects) {
1720 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{info} . "\t";
1722 $genotype_string .= "\n";
1723 $genotype_string .= "FORMAT\t";
1724 foreach my $m (@all_marker_objects) {
1725 my $format = $m->{format};
1726 my @format_array;
1727 #In case of old genotyping protocols where there was no protocolprop marker info
1728 if (!$format) {
1729 my $first_g = $geno->{selected_genotype_hash}->{$m->{name}};
1730 if (defined($first_g->{'GT'})) {
1731 push @format_array, 'GT';
1733 foreach my $k (sort keys %$first_g) {
1734 if ($k eq 'GT') {
1735 } elsif (defined($first_g->{$k})) {
1736 push @format_array, $k;
1739 } else {
1740 @format_array = split ':', $format;
1743 if (scalar(@format_array) > 1) { #ONLY ADD NT FOR NOT OLD GENOTYPING PROTOCOLS
1744 my %format_check = map {$_ => 1} @format_array;
1745 if (!exists($format_check{'NT'})) {
1746 push @format_array, 'NT';
1748 if (!exists($format_check{'DS'})) {
1749 push @format_array, 'DS';
1752 $format = join ':', @format_array;
1753 $genotype_string .= $format . "\t";
1754 $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{format} = $format;
1755 $m->{format} = $format;
1757 $genotype_string .= "\n";
1759 my $genotype_id = $geno->{germplasmName};
1760 if (!$self->return_only_first_genotypeprop_for_stock) {
1761 $genotype_id = $geno->{germplasmName}."|".$geno->{markerProfileDbId};
1763 my $genotype_data_string = "";
1764 foreach my $m (@all_marker_objects) {
1765 my @current_geno = ();
1766 my $name = $m->{name};
1767 my $format = $m->{format};
1768 my @format;
1769 my $gt;
1771 #In case of old genotyping protocols where there was no protocolprop marker info
1772 if (!$format) {
1773 my $first_g = $geno->{selected_genotype_hash}->{$name};
1774 foreach my $k (sort keys %$first_g) {
1775 if (defined($first_g->{$k})) {
1776 push @format, $k;
1779 } else {
1780 @format = split ':', $format;
1783 #VCF requires the GT field to be first
1784 if (defined($geno->{selected_genotype_hash}->{$m->{name}}->{'GT'})) {
1785 $usingGT = 1;
1786 my $val = $geno->{selected_genotype_hash}->{$m->{name}}->{'GT'};
1787 #if ($val eq '') {
1788 # $val = './.';
1790 push @current_geno, $val;
1791 } elsif ($usingGT) {
1792 push @current_geno, './.';
1794 foreach my $format_key (@format) {
1795 my $val = $geno->{selected_genotype_hash}->{$m->{name}}->{$format_key};
1796 if ($format_key eq 'GT') {
1797 } else {
1798 push @current_geno, $val;
1801 my $current_g = join ':', @current_geno;
1802 $genotype_data_string .= $current_g."\t";
1804 $genotype_string .= $genotype_id."\t".$genotype_data_string."\n";
1806 write_file($tempfile, {append => 1}, $genotype_string);
1807 $counter++;
1810 my $transpose_tempfile = $tempfile . "_transpose";
1812 my $cmd = CXGN::Tools::Run->new(
1814 backend => $backend_config,
1815 submit_host => $cluster_host_config,
1816 temp_base => $tmp_output_dir,
1817 queue => $web_cluster_queue_config,
1818 do_cleanup => 0,
1819 out_file => $transpose_tempfile,
1820 # out_file => $transpose_tempfile,
1821 # don't block and wait if the cluster looks full
1822 max_cluster_jobs => 1_000_000_000,
1826 # Do the transposition job on the cluster
1827 $cmd->run_cluster(
1828 "perl ",
1829 $basepath_config."/bin/transpose_matrix.pl",
1830 $tempfile,
1832 $cmd->is_cluster(1);
1833 $cmd->wait;
1835 my $transpose_tempfile_hdr = $tempfile . "_transpose_hdr";
1837 open my $in, '<', $transpose_tempfile or die "Can't read input file: $!";
1838 open my $out, '>', $transpose_tempfile_hdr or die "Can't write output file: $!";
1840 #Get synonyms of the accessions
1841 my $stocklookup = CXGN::Stock::StockLookup->new({schema => $self->bcs_schema});
1842 my @accession_ids = keys %unique_germplasm;
1843 my $synonym_hash = $stocklookup->get_stock_synonyms('stock_id', 'accession', \@accession_ids);
1844 my $synonym_string = "##SynonymsOfAccessions=\"";
1845 while( my( $uniquename, $synonym_list ) = each %{$synonym_hash}){
1846 if(scalar(@{$synonym_list})>0){
1847 if(not length($synonym_string)<1){
1848 $synonym_string.=" ";
1850 $synonym_string.=$uniquename."=(";
1851 $synonym_string.= (join ", ", @{$synonym_list}).")";
1854 $synonym_string .= "\"";
1855 push @all_protocol_info_lines, $synonym_string;
1857 my $vcf_header = join "\n", @all_protocol_info_lines;
1858 $vcf_header .= "\n";
1860 print $out $vcf_header;
1862 while( <$in> )
1864 print $out $_;
1866 close $in;
1867 close $out;
1869 open my $out_copy, '<', $transpose_tempfile_hdr or die "Can't open output file: $!";
1871 $self->cache()->set($key, '');
1872 $file_handle = $self->cache()->handle($key);
1873 copy($out_copy, $file_handle);
1875 close $out_copy;
1876 $file_handle = $self->cache()->handle($key);
1878 return $file_handle;
1881 =head2 get_cached_file_VCF_compute_from_parents()
1882 Computes the genotypes for the queried accessions computed from the parental dosages. Parents are known from pedigrees of accessions.
1883 Function for getting the file handle for the genotype search result from cache. Will write the cached file if it does not exist.
1884 Returns the genotype result as a dosage matrix format.
1885 Uses the file iterator to write the cached file, so that it uses little memory.
1886 =cut
1888 sub get_cached_file_VCF_compute_from_parents {
1889 my $self = shift;
1890 my $shared_cluster_dir_config = shift;
1891 my $backend_config = shift;
1892 my $cluster_host_config = shift;
1893 my $web_cluster_queue_config = shift;
1894 my $basepath_config = shift;
1895 my $schema = $self->bcs_schema;
1896 my $protocol_ids = $self->protocol_id_list;
1897 my $accession_ids = $self->accession_list;
1898 my $cache_root_dir = $self->cache_root();
1899 my $marker_name_list = $self->marker_name_list;
1901 if (scalar(@$protocol_ids)>1) {
1902 die "Only one protocol at a time can be done when computing genotypes from parents\n";
1904 my $protocol_id = $protocol_ids->[0];
1906 my $key = $self->key("get_cached_file_VCF_compute_from_parents_v04");
1907 $self->cache( Cache::File->new( cache_root => $cache_root_dir ));
1909 my $file_handle;
1910 if ($self->cache()->exists($key) && !$self->forbid_cache()) {
1911 $file_handle = $self->cache()->handle($key);
1913 else {
1914 # Set the temp dir and temp output file
1915 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_VCF_compute_from_parents";
1916 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
1917 my ($tmp_fh, $tempfile) = tempfile(
1918 "wizard_download_XXXXX",
1919 DIR=> $tmp_output_dir,
1922 my $accession_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
1923 my $female_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'female_parent', 'stock_relationship')->cvterm_id();
1924 my $male_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'male_parent', 'stock_relationship')->cvterm_id();
1926 my $accession_list_string = join ',', @$accession_ids;
1927 my $q = "SELECT accession.stock_id, female_parent.stock_id, male_parent.stock_id
1928 FROM stock AS accession
1929 JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
1930 JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
1931 JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
1932 JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
1933 WHERE accession.stock_id IN ($accession_list_string) AND accession.type_id=$accession_cvterm_id ORDER BY accession.stock_id;";
1934 my $h = $schema->storage->dbh()->prepare($q);
1935 $h->execute();
1936 my @accession_stock_ids_found = ();
1937 my @female_stock_ids_found = ();
1938 my @male_stock_ids_found = ();
1939 while (my ($accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
1940 push @accession_stock_ids_found, $accession_stock_id;
1941 push @female_stock_ids_found, $female_parent_stock_id;
1942 push @male_stock_ids_found, $male_parent_stock_id;
1945 # print STDERR Dumper \@accession_stock_ids_found;
1946 # print STDERR Dumper \@female_stock_ids_found;
1947 # print STDERR Dumper \@male_stock_ids_found;
1949 my $time = DateTime->now();
1950 my $timestamp = $time->ymd()."_".$time->hms();
1952 my @all_protocol_info_lines;
1954 my %unique_germplasm;
1955 my $protocol = CXGN::Genotype::Protocol->new({
1956 bcs_schema => $schema,
1957 nd_protocol_id => $protocol_id,
1958 chromosome_list=>$self->chromosome_list,
1959 start_position=>$self->start_position,
1960 end_position=>$self->end_position,
1961 marker_name_list=>$self->marker_name_list
1963 my $markers = $protocol->markers;
1964 my @all_marker_objects = values %$markers;
1965 my $protocol_name = $protocol->protocol_name();
1967 push @all_protocol_info_lines, @{$protocol->header_information_lines};
1968 push @all_protocol_info_lines, "##source=FILE GENERATED BY BREEDBASE";
1969 push @all_protocol_info_lines, "##fileDate=$timestamp";
1970 push @all_protocol_info_lines, "##Genotyping protocol id=$protocol_id";
1971 push @all_protocol_info_lines, "##Protocol name=$protocol_name";
1973 foreach (@all_marker_objects) {
1974 $self->_filtered_markers()->{$_->{name}}++;
1977 no warnings 'uninitialized';
1978 @all_marker_objects = sort { $a->{chrom} cmp $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
1979 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1981 my $counter = 0;
1982 for my $i (0..scalar(@accession_stock_ids_found)-1) {
1983 my $female_stock_id = $female_stock_ids_found[$i];
1984 my $male_stock_id = $male_stock_ids_found[$i];
1985 my $accession_stock_id = $accession_stock_ids_found[$i];
1987 my $dataset = CXGN::Dataset::Cache->new({
1988 people_schema=>$self->people_schema,
1989 schema=>$schema,
1990 cache_root=>$cache_root_dir,
1991 accessions=>[$female_stock_id, $male_stock_id]
1993 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name', 'chrom', 'pos', 'alt', 'ref'], 1, $self->chromosome_list, $self->start_position, $self->end_position, $self->marker_name_list);
1995 # For old protocols with no protocolprop info...
1996 if (scalar(@all_marker_objects) == 0) {
1997 foreach my $o (sort genosort keys %{$genotypes->[0]->{selected_genotype_hash}}) {
1998 push @all_marker_objects, {name => $o};
2000 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
2003 if (scalar(@$genotypes)>0) {
2004 my $geno = $genotypes->[0];
2005 $unique_germplasm{$accession_stock_id}++;
2007 my $genotype_string = "";
2008 if ($counter == 0) {
2009 $genotype_string .= "#CHROM\t";
2010 foreach my $m (@all_marker_objects) {
2011 my $chrom = $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom};
2012 if (! $chrom) {
2013 ($chrom) = split /\_/, $m->{name};
2015 $genotype_string .= $chrom . "\t";
2017 $genotype_string .= "\n";
2018 $genotype_string .= "POS\t";
2019 foreach my $m (@all_marker_objects) {
2020 my $pos = $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos};
2021 if (! $pos) {
2022 (undef, $pos) = split /\_/, $m->{name};
2024 $genotype_string .= $pos . "\t";
2026 $genotype_string .= "\n";
2027 $genotype_string .= "ID\t";
2028 foreach my $m (@all_marker_objects) {
2029 $genotype_string .= $m->{name} . "\t";
2031 $genotype_string .= "\n";
2032 $genotype_string .= "REF\t";
2033 foreach my $m (@all_marker_objects) {
2034 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{ref} . "\t";
2036 $genotype_string .= "\n";
2037 $genotype_string .= "ALT\t";
2038 foreach my $m (@all_marker_objects) {
2039 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{alt} . "\t";
2041 $genotype_string .= "\n";
2042 $genotype_string .= "QUAL\t";
2043 foreach my $m (@all_marker_objects) {
2044 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{qual} . "\t";
2046 $genotype_string .= "\n";
2047 $genotype_string .= "FILTER\t";
2048 foreach my $m (@all_marker_objects) {
2049 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{filter} . "\t";
2051 $genotype_string .= "\n";
2052 $genotype_string .= "INFO\t";
2053 foreach my $m (@all_marker_objects) {
2054 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{info} . "\t";
2056 $genotype_string .= "\n";
2057 $genotype_string .= "FORMAT\t";
2058 foreach my $m (@all_marker_objects) {
2059 my $format = 'DS'; #When calculating genotypes from parents, only will return dosages atleast for now
2060 $genotype_string .= $format . "\t";
2061 $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{format} = $format;
2062 $m->{format} = $format;
2064 $genotype_string .= "\n";
2066 my $genotype_id = $geno->{germplasmName};
2067 if (!$self->return_only_first_genotypeprop_for_stock) {
2068 $genotype_id = $geno->{germplasmName}."|".$geno->{markerProfileDbId};
2071 my $geno_h = CXGN::Genotype::ComputeHybridGenotype->new({
2072 parental_genotypes=>$genotypes,
2073 marker_objects=>\@all_marker_objects
2075 my $progeny_genotype = $geno_h->get_hybrid_genotype();
2076 my $genotype_string_scores = join "\t", @$progeny_genotype;
2078 $genotype_string .= $genotype_id."\t".$genotype_string_scores."\n";
2080 write_file($tempfile, {append => 1}, $genotype_string);
2081 $counter++;
2085 my $transpose_tempfile = $tempfile . "_transpose";
2087 my $cmd = CXGN::Tools::Run->new(
2089 backend => $backend_config,
2090 submit_host => $cluster_host_config,
2091 temp_base => $tmp_output_dir,
2092 queue => $web_cluster_queue_config,
2093 do_cleanup => 0,
2094 out_file => $transpose_tempfile,
2095 # out_file => $transpose_tempfile,
2096 # don't block and wait if the cluster looks full
2097 max_cluster_jobs => 1_000_000_000,
2101 # Do the transposition job on the cluster
2102 $cmd->run_cluster(
2103 "perl ",
2104 $basepath_config."/bin/transpose_matrix.pl",
2105 $tempfile,
2107 $cmd->is_cluster(1);
2108 $cmd->wait;
2110 my $transpose_tempfile_hdr = $tempfile . "_transpose_hdr";
2112 open my $in, '<', $transpose_tempfile or die "Can't read input file: $!";
2113 open my $out, '>', $transpose_tempfile_hdr or die "Can't write output file: $!";
2115 #Get synonyms of the accessions
2116 my $stocklookup = CXGN::Stock::StockLookup->new({schema => $self->bcs_schema});
2117 my @accession_ids = keys %unique_germplasm;
2118 my $synonym_hash = $stocklookup->get_stock_synonyms('stock_id', 'accession', \@accession_ids);
2119 my $synonym_string = "##SynonymsOfAccessions=\"";
2120 while( my( $uniquename, $synonym_list ) = each %{$synonym_hash}){
2121 if(scalar(@{$synonym_list})>0){
2122 if(not length($synonym_string)<1){
2123 $synonym_string.=" ";
2125 $synonym_string.=$uniquename."=(";
2126 $synonym_string.= (join ", ", @{$synonym_list}).")";
2129 $synonym_string .= "\"";
2130 push @all_protocol_info_lines, $synonym_string;
2132 my $vcf_header = join "\n", @all_protocol_info_lines;
2133 $vcf_header .= "\n";
2135 print $out $vcf_header;
2137 while( <$in> )
2139 print $out $_;
2141 close $in;
2142 close $out;
2144 open my $out_copy, '<', $transpose_tempfile_hdr or die "Can't open output file: $!";
2146 $self->cache()->set($key, '');
2147 $file_handle = $self->cache()->handle($key);
2148 copy($out_copy, $file_handle);
2150 close $out_copy;
2151 $file_handle = $self->cache()->handle($key);
2153 return $file_handle;
2156 sub genosort {
2157 my ($a_chr, $a_pos, $b_chr, $b_pos);
2158 if ($a =~ m/S(\d+)\_(.*)/) {
2159 $a_chr = $1;
2160 $a_pos = $2;
2162 if ($b =~ m/S(\d+)\_(.*)/) {
2163 $b_chr = $1;
2164 $b_pos = $2;
2167 if ($a_chr && $b_chr) {
2168 if ($a_chr == $b_chr) {
2169 return $a_pos <=> $b_pos;
2171 return $a_chr <=> $b_chr;
2172 } else {
2173 return -1;
2177 sub _check_filtered_markers {
2178 my $self = shift;
2179 my $all_marker_objs = shift;
2180 my @all_marker_objects = @$all_marker_objs;
2181 my @filtered_marker_objects;
2182 if ($self->_filtered_markers && scalar(keys %{$self->_filtered_markers}) > 0) {
2183 my $filtered_markers = $self->_filtered_markers;
2184 foreach (@all_marker_objects) {
2185 if (exists($filtered_markers->{$_->{name}})) {
2186 push @filtered_marker_objects, $_;
2189 @all_marker_objects = @filtered_marker_objects;
2191 return @all_marker_objects;
2195 sub get_pcr_genotype_info {
2196 my $self = shift;
2197 my $schema = $self->bcs_schema;
2198 my $genotype_data_project_list = $self->genotype_data_project_list;
2199 my $protocol_id_list = $self->protocol_id_list;
2200 my $genotype_id_list = $self->markerprofile_id_list;
2201 my $protocol_id = $protocol_id_list->[0];
2202 my @where_clause = ();
2203 # print STDERR "GENOTYPE ID =".Dumper($genotype_id_list)."\n";
2204 # print STDERR "PROTOCOL ID =".Dumper($protocol_id_list)."\n";
2206 if ($genotype_data_project_list && scalar(@$genotype_data_project_list)>0) {
2207 my $sql = join ("," , @$genotype_data_project_list);
2208 push @where_clause, "nd_experiment_project.project_id in ($sql)";
2211 if ($protocol_id_list && scalar(@$protocol_id_list)>0) {
2212 my $query = join ("," , @$protocol_id_list);
2213 push @where_clause, "nd_protocol.nd_protocol_id in ($query)";
2216 if ($genotype_id_list && scalar(@$genotype_id_list)>0) {
2217 my $query = join ("," , @$genotype_id_list);
2218 push @where_clause, "genotype.genotype_id in ($query)";
2220 # print STDERR "WHERE CLAUSE =".Dumper(\@where_clause)."\n";
2221 my $where_clause = scalar(@where_clause)>0 ? " WHERE " . (join (" AND " , @where_clause)) : '';
2223 my $pcr_genotyping_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'pcr_marker_genotyping', 'genotype_property')->cvterm_id();
2224 my $pcr_protocolprop_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'pcr_marker_details', 'protocol_property')->cvterm_id();
2225 my $pcr_protocol_type_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'pcr_marker_protocol', 'protocol_type')->cvterm_id();
2226 my $ploidy_level_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, "ploidy_level", "stock_property")->cvterm_id();
2228 my $q = "SELECT stock.stock_id, stock.uniquename, stockprop.value, cvterm.name, genotype.genotype_id, genotype.description, genotypeprop.value, nd_protocol.nd_protocol_id
2229 FROM nd_protocol
2230 JOIN nd_experiment_protocol ON (nd_protocol.nd_protocol_id = nd_experiment_protocol.nd_protocol_id)
2231 JOIN nd_experiment_genotype ON (nd_experiment_protocol.nd_experiment_id = nd_experiment_genotype.nd_experiment_id)
2232 JOIN nd_experiment_project ON (nd_experiment_protocol.nd_experiment_id = nd_experiment_project.nd_experiment_id)
2233 JOIN genotype ON (nd_experiment_genotype.genotype_id = genotype.genotype_id)
2234 JOIN genotypeprop ON (genotype.genotype_id = genotypeprop.genotype_id) AND genotypeprop.type_id = ?
2235 JOIN nd_experiment_stock ON (nd_experiment_genotype.nd_experiment_id = nd_experiment_stock.nd_experiment_id)
2236 JOIN stock ON (nd_experiment_stock.stock_id = stock.stock_id)
2237 LEFT JOIN stockprop ON (stockprop.stock_id = stock.stock_id) AND stockprop.type_id = ?
2238 JOIN cvterm ON (stock.type_id = cvterm.cvterm_id)
2239 $where_clause ORDER BY stock.uniquename ASC";
2241 my $h = $schema->storage->dbh()->prepare($q);
2242 $h->execute($pcr_genotyping_cvterm_id, $ploidy_level_cvterm_id);
2244 my @pcr_genotype_data = ();
2245 while (my ($stock_id, $stock_name, $ploidy_level, $stock_type, $genotype_id, $genotype_description, $genotype_data, $nd_protocol_id) = $h->fetchrow_array()){
2246 push @pcr_genotype_data, [$stock_id, $stock_name, $stock_type, $ploidy_level, $genotype_description, $genotype_id, $genotype_data, $nd_protocol_id]
2249 if (!defined $protocol_id) {
2250 $protocol_id = $pcr_genotype_data[0][7];
2253 my $q1 = "SELECT nd_protocolprop.value->>'marker_names' FROM nd_protocolprop WHERE nd_protocol_id = ? AND nd_protocolprop.type_id = ?";
2254 my $h1 = $schema->storage->dbh()->prepare($q1);
2255 $h1->execute($protocol_id, $pcr_protocolprop_cvterm_id);
2256 my ($marker_names) = $h1->fetchrow_array();
2258 my %ssr_genotype_data = (
2259 marker_names => $marker_names,
2260 ssr_genotype_data => \@pcr_genotype_data
2263 # print STDERR "PCR GENOTYPE INFO =".Dumper(\%ssr_genotype_data)."\n";
2265 return \%ssr_genotype_data;
2269 sub search_vcf_genotyping_cvterm_id {
2270 my $self = shift;
2271 my $search_param = shift;
2272 my $schema = $self->bcs_schema;
2274 my $vcf_genotyping_cvterm_id;
2275 if (defined($search_param->{protocol_id}) || defined($search_param->{genotype_id} )) {
2276 $vcf_genotyping_cvterm_id = SGN::Model::Cvterm->get_vcf_genotyping_cvterm_id($schema, $search_param);
2277 } else {
2278 my $vcf_genotyping_type = $search_param->{vcf_genotyping_type} || 'vcf_snp_genotyping';
2279 $vcf_genotyping_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, $vcf_genotyping_type, 'genotype_property')->cvterm_id();
2282 return $vcf_genotyping_cvterm_id;