fix typo.
[sgn.git] / Search.pm
bloba761b270aa148b0ec4d68081e0fdcba0f852a390
1 package CXGN::Genotype::Search;
3 =head1 NAME
5 CXGN::Genotype::Search - an object to handle searching genotypes for stocks
7 =head1 USAGE
9 PLEASE BE AWARE THAT THE DEFAULT OPTIONS FOR genotypeprop_hash_select, protocolprop_top_key_select, protocolprop_marker_hash_select ARE PRONE TO EXCEEDING THE MEMORY LIMITS OF VM. CHECK THE MOOSE ATTRIBUTES BELOW TO SEE THE DEFAULTS, AND ADJUST YOUR MOOSE INSTANTIATION ACCORDINGLY
11 my $genotypes_search = CXGN::Genotype::Search->new({
12 bcs_schema=>$schema,
13 people_schema=>$people_schema,
14 accession_list=>$accession_list,
15 tissue_sample_list=>$tissue_sample_list,
16 trial_list=>$trial_list,
17 protocol_id_list=>$protocol_id_list,
18 markerprofile_id_list=>$markerprofile_id_list,
19 genotype_data_project_list=>$genotype_data_project_list,
20 chromosome_list=>\@chromosome_numbers,
21 start_position=>$start_position,
22 end_position=>$end_position,
23 marker_name_list=>['S80_265728', 'S80_265723'],
24 genotypeprop_hash_select=>['DS', 'GT', 'DP'], #THESE ARE THE KEYS IN THE GENOTYPEPROP OBJECT
25 protocolprop_top_key_select=>['reference_genome_name', 'header_information_lines', 'marker_names', 'markers'], #THESE ARE THE KEYS AT THE TOP LEVEL OF THE PROTOCOLPROP OBJECT
26 protocolprop_marker_hash_select=>['name', 'chrom', 'pos', 'alt', 'ref'], #THESE ARE THE KEYS IN THE MARKERS OBJECT IN THE PROTOCOLPROP OBJECT
27 return_only_first_genotypeprop_for_stock=>0, #THIS IS TO CONSERVE MEMORY USAGE
28 limit=>$limit,
29 offset=>$offset,
30 forbid_cache=>$forbid_cache
31 # marker_search_hash_list=>[{'S80_265728' => {'pos' => '265728', 'chrom' => '1'}}], NOT IMPLEMENTED
32 # marker_score_search_hash_list=>[{'S80_265728' => {'GT' => '0/0', 'GQ' => '99'}}], NOT IMPLEMENTED
33 });
34 # my ($total_count, $genotypes) = $genotypes_search->get_genotype_info();
36 # RECOMMENDED
37 If you just want to get a file with the genotype result in a dosage matrix or VCF file, use get_cached_file_dosage_matrix or get_cached_file_VCF functions instead.
38 If you want results in json format use get_cached_file_search_json
39 If you want results in json format for only the metadata (no genotype call data), use get_cached_file_search_json()
41 =head1 DESCRIPTION
44 =head1 AUTHORS
46 Nicolas Morales <nm529@cornell.edu>
47 With code moved from CXGN::BreederSearch
48 Lukas Mueller <lam87@cornell.edu>
49 Aimin Yan <ay247@cornell.edu>
51 =cut
53 use strict;
54 use warnings;
55 use Moose;
56 use Try::Tiny;
57 use Data::Dumper;
58 use SGN::Model::Cvterm;
59 use CXGN::Trial;
60 use JSON;
61 use CXGN::Stock::Accession;
62 use CXGN::Genotype::Protocol;
63 use CXGN::Genotype::ComputeHybridGenotype;
64 use Cache::File;
65 use Digest::MD5 qw | md5_hex |;
66 use File::Slurp qw | write_file |;
67 use File::Temp qw | tempfile |;
68 use File::Copy;
69 use POSIX;
71 has 'bcs_schema' => (
72 isa => 'Bio::Chado::Schema',
73 is => 'rw',
74 required => 1
77 has 'people_schema' => (
78 isa => 'CXGN::People::Schema',
79 is => 'rw',
80 required => 1
83 has 'protocol_id_list' => (
84 isa => 'ArrayRef[Int]|Undef',
85 is => 'rw',
88 has 'markerprofile_id_list' => (
89 isa => 'ArrayRef[Int]|Undef',
90 is => 'ro',
93 has 'accession_list' => (
94 isa => 'ArrayRef[Int]|Undef',
95 is => 'ro',
98 has 'tissue_sample_list' => (
99 isa => 'ArrayRef[Int]|Undef',
100 is => 'ro',
103 has 'trial_list' => (
104 isa => 'ArrayRef[Int]|Undef',
105 is => 'ro',
108 has 'genotype_data_project_list' => (
109 isa => 'ArrayRef[Int]|Undef',
110 is => 'ro',
113 has 'chromosome_list' => (
114 isa => 'ArrayRef[Int]|ArrayRef[Str]|Undef',
115 is => 'ro',
118 has 'start_position' => (
119 isa => 'Int|Undef',
120 is => 'ro',
123 has 'end_position' => (
124 isa => 'Int|Undef',
125 is => 'ro',
128 has 'marker_name_list' => (
129 isa => 'ArrayRef[Str]|Undef',
130 is => 'ro',
133 has 'genotypeprop_hash_select' => (
134 isa => 'ArrayRef[Str]',
135 is => 'ro',
136 default => sub {['GT', 'AD', 'DP', 'GQ', 'DS', 'PL', 'NT']} #THESE ARE THE GENERIC AND EXPECTED VCF ATRRIBUTES
139 has 'protocolprop_top_key_select' => (
140 isa => 'ArrayRef[Str]',
141 is => 'ro',
142 default => sub {['reference_genome_name', 'species_name', 'header_information_lines', 'sample_observation_unit_type_name', 'marker_names', 'markers', 'markers_array']} #THESE ARE ALL POSSIBLE TOP LEVEL KEYS IN PROTOCOLPROP BASED ON VCF LOADING
145 has 'protocolprop_marker_hash_select' => (
146 isa => 'ArrayRef[Str]',
147 is => 'ro',
148 default => sub {['name', 'chrom', 'pos', 'alt', 'ref', 'qual', 'filter', 'info', 'format']} #THESE ARE ALL POSSIBLE PROTOCOLPROP MARKER HASH KEYS BASED ON VCF LOADING
151 has 'return_only_first_genotypeprop_for_stock' => (
152 isa => 'Bool',
153 is => 'ro',
154 default => 0
157 # When using the get_cached_file_dosage_matrix or get_cached_file_VCF functions, need the following three keys.
158 has 'cache_root' => (
159 isa => 'Str',
160 is => 'rw',
163 has 'cache' => (
164 isa => 'Cache::File',
165 is => 'rw',
168 has 'cache_expiry' => (
169 isa => 'Int',
170 is => 'rw',
171 default => 0, # never expires?
174 has 'forbid_cache' => (
175 isa => 'Bool',
176 is => 'ro',
177 default => 0
180 has 'prevent_transpose' => (
181 isa => 'Bool',
182 is => 'ro',
183 default => 0
186 has '_iterator_query_handle' => (
187 isa => 'Ref',
188 is => 'rw'
191 has '_iterator_genotypeprop_query_handle' => (
192 isa => 'Ref',
193 is => 'rw'
196 has '_filtered_markers' => (
197 isa => 'HashRef',
198 is => 'rw',
199 default => sub {{}}
202 has '_snp_genotyping_cvterm_id' => (
203 isa => 'Int',
204 is => 'rw'
207 has '_vcf_snp_genotyping_cvterm_id' => (
208 isa => 'Int',
209 is => 'rw'
212 has '_vcf_map_details_cvterm_id' => (
213 isa => 'Int',
214 is => 'rw'
217 has '_vcf_map_details_markers_cvterm_id' => (
218 isa => 'Int',
219 is => 'rw'
222 has '_vcf_map_details_markers_array_cvterm_id' => (
223 isa => 'Int',
224 is => 'rw'
227 has '_igd_genotypeprop_cvterm_id' => (
228 isa => 'Int',
229 is => 'rw'
232 has '_accession_cvterm_id' => (
233 isa => 'Int',
234 is => 'rw'
237 has '_tissue_sample_cvterm_id' => (
238 isa => 'Int',
239 is => 'rw'
242 has '_plot_cvterm_id' => (
243 isa => 'Int',
244 is => 'rw'
247 has '_plant_cvterm_id' => (
248 isa => 'Int',
249 is => 'rw'
252 has '_tissue_sample_of_cvterm_id' => (
253 isa => 'Int',
254 is => 'rw'
257 has '_protocolprop_markers_h' => (
258 isa => 'Ref',
259 is => 'rw'
262 has '_protocolprop_top_key_h' => (
263 isa => 'Ref',
264 is => 'rw'
267 has '_protocolprop_top_key_markers_h' => (
268 isa => 'Ref',
269 is => 'rw'
272 has '_protocolprop_top_key_markers_array_h' => (
273 isa => 'Ref',
274 is => 'rw'
277 has '_genotypeprop_h' => (
278 isa => 'Ref',
279 is => 'rw'
282 has '_protocolprop_marker_hash_select_arr' => (
283 isa => 'ArrayRef',
284 is => 'rw'
287 has '_protocolprop_top_key_select_arr' => (
288 isa => 'ArrayRef',
289 is => 'rw'
292 has '_selected_protocol_marker_info' => (
293 isa => 'Ref',
294 is => 'rw'
297 has '_selected_protocol_top_key_info' => (
298 isa => 'Ref',
299 is => 'rw'
302 has '_genotypeprop_infos' => (
303 isa => 'ArrayRef',
304 is => 'rw'
307 has '_genotypeprop_infos_counter' => (
308 isa => 'Int',
309 is => 'rw'
312 has '_genotypeprop_hash_select_arr' => (
313 isa => 'ArrayRef',
314 is => 'rw'
317 #NOT IMPLEMENTED
318 has 'marker_search_hash_list' => (
319 isa => 'ArrayRef[HashRef]|Undef',
320 is => 'ro',
323 #NOT IMPLEMENTED
324 has 'marker_score_search_hash_list' => (
325 isa => 'ArrayRef[HashRef]|Undef',
326 is => 'ro',
329 has 'limit' => (
330 isa => 'Int|Undef',
331 is => 'rw',
334 has 'offset' => (
335 isa => 'Int|Undef',
336 is => 'rw',
339 =head2 get_genotype_info
341 returns: an array with genotype information
343 =cut
345 =head2 get_genotype_info()
347 Function for getting genotype data iteratively.
348 Should be used like:
350 my $genotype_search = CXGN::Genotype::Search->new({
351 bcs_schema=>$schema,
352 etc...
354 my ($count, $genotype_data) = $genotype_search->get_genotype_info();
356 If you want to get results iteratively, use the init and iterator function defined below instead. Iterative retrieval minimizes memory load.
358 If you just want to get a file with the genotype result in a dosage matrix or VCF file, use get_cached_file_dosage_matrix or get_cached_file_VCF functions instead.
359 If you want results in json format use get_cached_file_search_json
360 If you want results in json format for only the metadata (no genotype call data), use get_cached_file_search_json()
362 =cut
364 sub get_genotype_info {
365 my $self = shift;
366 my $schema = $self->bcs_schema;
367 my $trial_list = $self->trial_list;
368 my $genotype_data_project_list = $self->genotype_data_project_list;
369 my $protocol_id_list = $self->protocol_id_list;
370 my $markerprofile_id_list = $self->markerprofile_id_list;
371 my $accession_list = $self->accession_list;
372 my $tissue_sample_list = $self->tissue_sample_list;
373 my $marker_name_list = $self->marker_name_list;
374 my $chromosome_list = $self->chromosome_list;
375 my $start_position = $self->start_position;
376 my $end_position = $self->end_position;
377 my $genotypeprop_hash_select = $self->genotypeprop_hash_select;
378 my $protocolprop_top_key_select = $self->protocolprop_top_key_select;
379 my $protocolprop_marker_hash_select = $self->protocolprop_marker_hash_select;
380 my $marker_search_hash_list = $self->marker_search_hash_list;
381 my $marker_score_search_hash_list = $self->marker_score_search_hash_list;
382 my $return_only_first_genotypeprop_for_stock = $self->return_only_first_genotypeprop_for_stock;
383 my $limit = $self->limit;
384 my $offset = $self->offset;
385 my @data;
386 my %search_params;
387 my @where_clause;
389 my $snp_genotyping_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'snp genotyping', 'genotype_property')->cvterm_id();
390 my $vcf_snp_genotyping_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_snp_genotyping', 'genotype_property')->cvterm_id();
391 my $vcf_map_details_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details', 'protocol_property')->cvterm_id();
392 my $vcf_map_details_markers_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details_markers', 'protocol_property')->cvterm_id();
393 my $vcf_map_details_markers_array_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details_markers_array', 'protocol_property')->cvterm_id();
394 my $igd_genotypeprop_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'igd number', 'genotype_property')->cvterm_id();
395 my $accession_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'accession', 'stock_type')->cvterm_id();
396 my $tissue_sample_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'tissue_sample', 'stock_type')->cvterm_id();
397 my $plot_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'plot', 'stock_type')->cvterm_id();
398 my $plant_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'plant', 'stock_type')->cvterm_id();
399 my $tissue_sample_of_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'tissue_sample_of', 'stock_relationship')->cvterm_id();
401 my @trials_accessions;
402 foreach (@$trial_list){
403 my $trial = CXGN::Trial->new({bcs_schema=>$schema, trial_id=>$_});
404 my $accessions = $trial->get_accessions();
405 foreach (@$accessions){
406 push @trials_accessions, $_->{stock_id};
410 #If accessions are explicitly given, then accessions found from trials will not be added to the search.
411 if (!$accession_list || scalar(@$accession_list)==0) {
412 push @$accession_list, @trials_accessions;
415 #For projects inserted into database during the addition of genotypes and genotypeprops
416 if (scalar(@trials_accessions)==0){
417 if ($trial_list && scalar(@$trial_list)>0) {
418 my $trial_sql = join ("," , @$trial_list);
419 push @where_clause, "project.project_id in ($trial_sql)";
423 #For genotyping_data_project
424 if ($genotype_data_project_list && scalar($genotype_data_project_list)>0) {
425 my $sql = join ("," , @$genotype_data_project_list);
426 push @where_clause, "project.project_id in ($sql)";
428 if ($protocol_id_list && scalar(@$protocol_id_list)>0) {
429 my $protocol_sql = join ("," , @$protocol_id_list);
430 push @where_clause, "nd_protocol.nd_protocol_id in ($protocol_sql)";
432 if ($accession_list && scalar(@$accession_list)>0) {
433 my $accession_sql = join ("," , @$accession_list);
434 push @where_clause, " ( stock.stock_id in ($accession_sql) OR (accession_of_tissue_sample.stock_id in ($accession_sql) AND accession_of_tissue_sample.type_id = $accession_cvterm_id) ) ";
435 push @where_clause, "stock.type_id in ($accession_cvterm_id, $tissue_sample_cvterm_id)";
437 if ($tissue_sample_list && scalar(@$tissue_sample_list)>0) {
438 my $stock_sql = join ("," , @$tissue_sample_list);
439 push @where_clause, "stock.stock_id in ($stock_sql)";
440 push @where_clause, "stock.type_id = $tissue_sample_cvterm_id";
442 if ($markerprofile_id_list && scalar(@$markerprofile_id_list)>0) {
443 my $markerprofile_sql = join ("," , @$markerprofile_id_list);
444 push @where_clause, "genotype.genotype_id in ($markerprofile_sql)";
446 if ($marker_name_list && scalar(@$marker_name_list)>0) {
447 my $search_vals_sql = "'".join ("','" , @$marker_name_list)."'";
448 push @where_clause, "nd_protocolprop.value->'marker_names' \\?& array[$search_vals_sql]";
450 if ($marker_search_hash_list && scalar(@$marker_search_hash_list)>0) {
451 foreach (@$marker_search_hash_list){
452 my $json_val = encode_json $_;
453 push @where_clause, "nd_protocolprop.value->'markers' \\@> $json_val"."::jsonb";
456 if ($marker_score_search_hash_list && scalar(@$marker_score_search_hash_list)>0) {
457 foreach (@$marker_score_search_hash_list){
458 my $json_val = encode_json $_;
459 push @where_clause, "genotype_values.value \\@> $json_val"."::jsonb";
463 my $where_clause = scalar(@where_clause)>0 ? " WHERE " . (join (" AND " , @where_clause)) : '';
465 my $offset_clause = '';
466 my $limit_clause = '';
467 if ($limit){
468 $limit_clause = " LIMIT $limit ";
470 if ($offset){
471 $offset_clause = " OFFSET $offset ";
474 my $stock_select = '';
475 if ($return_only_first_genotypeprop_for_stock) {
476 $stock_select = 'distinct on (stock.stock_id) stock.stock_id';
477 } else {
478 $stock_select = 'stock.stock_id';
481 my $q = "SELECT $stock_select, igd_number_genotypeprop.value, nd_protocol.nd_protocol_id, nd_protocol.name, stock.uniquename, stock.type_id, stock_cvterm.name, genotype.genotype_id, genotype.uniquename, genotype.description, project.project_id, project.name, project.description, accession_of_tissue_sample.stock_id, accession_of_tissue_sample.uniquename, count(genotype.genotype_id) OVER() AS full_count
482 FROM stock
483 JOIN cvterm AS stock_cvterm ON(stock.type_id = stock_cvterm.cvterm_id)
484 LEFT JOIN stock_relationship ON(stock_relationship.subject_id=stock.stock_id AND stock_relationship.type_id = $tissue_sample_of_cvterm_id)
485 LEFT JOIN stock AS accession_of_tissue_sample ON(stock_relationship.object_id=accession_of_tissue_sample.stock_id)
486 JOIN nd_experiment_stock ON(stock.stock_id=nd_experiment_stock.stock_id)
487 JOIN nd_experiment USING(nd_experiment_id)
488 JOIN nd_experiment_protocol USING(nd_experiment_id)
489 JOIN nd_experiment_project USING(nd_experiment_id)
490 JOIN nd_experiment_genotype USING(nd_experiment_id)
491 JOIN nd_protocol USING(nd_protocol_id)
492 LEFT JOIN nd_protocolprop ON(nd_protocolprop.nd_protocol_id = nd_protocol.nd_protocol_id AND nd_protocolprop.type_id = $vcf_map_details_cvterm_id)
493 JOIN genotype USING(genotype_id)
494 LEFT JOIN genotypeprop AS igd_number_genotypeprop ON(igd_number_genotypeprop.genotype_id = genotype.genotype_id AND igd_number_genotypeprop.type_id = $igd_genotypeprop_cvterm_id)
495 JOIN project USING(project_id)
496 $where_clause
497 ORDER BY stock.stock_id, genotype.genotype_id ASC
498 $limit_clause
499 $offset_clause;";
501 #print STDERR Dumper $q;
502 my $h = $schema->storage->dbh()->prepare($q);
503 $h->execute();
505 my $total_count = 0;
506 my @genotype_id_array;
507 my @genotypeprop_array;
508 my %genotype_hash;
509 my %genotypeprop_hash;
510 my %protocolprop_hash;
511 while (my ($stock_id, $igd_number_json, $protocol_id, $protocol_name, $stock_name, $stock_type_id, $stock_type_name, $genotype_id, $genotype_uniquename, $genotype_description, $project_id, $project_name, $project_description, $accession_id, $accession_uniquename, $full_count) = $h->fetchrow_array()) {
512 my $igd_number_hash = $igd_number_json ? decode_json $igd_number_json : undef;
513 my $igd_number = $igd_number_hash ? $igd_number_hash->{'igd number'} : undef;
514 $igd_number = !$igd_number && $igd_number_hash ? $igd_number_hash->{'igd_number'} : undef;
516 my $germplasmName = '';
517 my $germplasmDbId = '';
518 if ($stock_type_name eq 'accession'){
519 $germplasmName = $stock_name;
520 $germplasmDbId = $stock_id;
522 if ($stock_type_name eq 'tissue_sample'){
523 $germplasmName = $accession_uniquename;
524 $germplasmDbId = $accession_id;
527 my $stock_object = CXGN::Stock::Accession->new({schema=>$self->bcs_schema, stock_id=>$germplasmDbId});
529 push @genotype_id_array, $genotype_id;
531 $genotype_hash{$genotype_id} = {
532 markerProfileDbId => $genotype_id,
533 germplasmDbId => $germplasmDbId,
534 germplasmName => $germplasmName,
535 synonyms => $stock_object->synonyms,
536 stock_id => $stock_id,
537 stock_name => $stock_name,
538 stock_type_id => $stock_type_id,
539 stock_type_name => $stock_type_name,
540 genotypeDbId => $genotype_id,
541 genotypeUniquename => $genotype_uniquename,
542 genotypeDescription => $genotype_description,
543 analysisMethodDbId => $protocol_id,
544 analysisMethod => $protocol_name,
545 genotypingDataProjectDbId => $project_id,
546 genotypingDataProjectName => $project_name,
547 genotypingDataProjectDescription => $project_description,
548 igd_number => $igd_number,
550 $protocolprop_hash{$protocol_id}++;
551 $total_count = $full_count;
553 print STDERR "CXGN::Genotype::Search has genotype_ids $total_count\n";
555 my @found_protocolprop_ids = keys %protocolprop_hash;
556 my @protocolprop_marker_hash_select_arr;
557 foreach (@$protocolprop_marker_hash_select){
558 push @protocolprop_marker_hash_select_arr, "s.value->>'$_'";
560 my @protocolprop_top_key_select_arr;
561 my %protocolprop_top_key_select_hash;
562 foreach (@$protocolprop_top_key_select){
563 if ($_ ne 'markers' && $_ ne 'markers_array') {
564 push @protocolprop_top_key_select_arr, "value->>'$_'";
566 $protocolprop_top_key_select_hash{$_}++;
568 my %selected_protocol_marker_info;
569 my %selected_protocol_top_key_info;
570 my %filtered_markers;
571 my $genotypeprop_chromosome_rank_string = '';
572 if (scalar(@found_protocolprop_ids)>0){
573 my $protocolprop_id_sql = join ("," , @found_protocolprop_ids);
574 my $protocolprop_where_sql = "nd_protocol_id in ($protocolprop_id_sql) and type_id = $vcf_map_details_cvterm_id";
575 my $protocolprop_where_markers_sql = "nd_protocol_id in ($protocolprop_id_sql) and type_id = $vcf_map_details_markers_cvterm_id";
576 my $protocolprop_where_markers_array_sql = "nd_protocol_id in ($protocolprop_id_sql) and type_id = $vcf_map_details_markers_array_cvterm_id";
577 my $protocolprop_hash_select_sql = scalar(@protocolprop_marker_hash_select_arr) > 0 ? ', '.join ',', @protocolprop_marker_hash_select_arr : '';
579 my $chromosome_where = '';
580 if ($chromosome_list && scalar(@$chromosome_list)>0) {
581 my $chromosome_list_sql = '\'' . join('\', \'', @$chromosome_list) . '\'';
582 $chromosome_where = " AND (s.value->>'chrom')::text IN ($chromosome_list_sql)";
583 #$genotypeprop_chromosome_rank_string = " AND value->>'CHROM' IN ($chromosome_list_sql) ";
585 my $start_position_where = '';
586 if (defined($start_position)) {
587 $start_position_where = " AND (s.value->>'pos')::int >= $start_position";
589 my $end_position_where = '';
590 if (defined($end_position)) {
591 $end_position_where = " AND (s.value->>'pos')::int <= $end_position";
594 my $protocolprop_q = "SELECT nd_protocol_id, s.key $protocolprop_hash_select_sql
595 FROM nd_protocolprop, jsonb_each(nd_protocolprop.value) as s
596 WHERE $protocolprop_where_markers_sql $chromosome_where $start_position_where $end_position_where;";
598 my $protocolprop_h = $schema->storage->dbh()->prepare($protocolprop_q);
599 $protocolprop_h->execute();
600 while (my ($protocol_id, $marker_name, @protocolprop_info_return) = $protocolprop_h->fetchrow_array()) {
601 for my $s (0 .. scalar(@protocolprop_marker_hash_select_arr)-1){
602 $selected_protocol_marker_info{$protocol_id}->{$marker_name}->{$protocolprop_marker_hash_select->[$s]} = $protocolprop_info_return[$s];
604 $filtered_markers{$marker_name}++;
606 my $protocolprop_top_key_select_sql = scalar(@protocolprop_top_key_select_arr) > 0 ? ', '.join ',', @protocolprop_top_key_select_arr : '';
607 my $protocolprop_top_key_q = "SELECT nd_protocol_id $protocolprop_top_key_select_sql from nd_protocolprop WHERE $protocolprop_where_sql;";
608 my $protocolprop_top_key_h = $schema->storage->dbh()->prepare($protocolprop_top_key_q);
609 $protocolprop_top_key_h->execute();
610 while (my ($protocol_id, @protocolprop_top_key_return) = $protocolprop_top_key_h->fetchrow_array()) {
611 for my $s (0 .. scalar(@protocolprop_top_key_select_arr)-1){
612 my $protocolprop_i = $protocolprop_top_key_select->[$s];
613 my $val;
614 if ($protocolprop_i eq 'header_information_lines' || $protocolprop_i eq 'marker_names') {
615 $val = decode_json $protocolprop_top_key_return[$s];
616 } else {
617 $val = $protocolprop_top_key_return[$s];
619 $selected_protocol_top_key_info{$protocol_id}->{$protocolprop_i} = $val;
622 if (exists($protocolprop_top_key_select_hash{'markers'})) {
623 my $protocolprop_top_key_q = "SELECT nd_protocol_id, value from nd_protocolprop WHERE $protocolprop_where_markers_sql;";
624 my $protocolprop_top_key_h = $schema->storage->dbh()->prepare($protocolprop_top_key_q);
625 $protocolprop_top_key_h->execute();
626 while (my ($protocol_id, $markers_value) = $protocolprop_top_key_h->fetchrow_array()) {
627 $selected_protocol_top_key_info{$protocol_id}->{'markers'} = decode_json $markers_value;
630 if (exists($protocolprop_top_key_select_hash{'markers_array'})) {
631 my $protocolprop_top_key_q = "SELECT nd_protocol_id, value from nd_protocolprop WHERE $protocolprop_where_markers_array_sql;";
632 my $protocolprop_top_key_h = $schema->storage->dbh()->prepare($protocolprop_top_key_q);
633 $protocolprop_top_key_h->execute();
634 while (my ($protocol_id, $markers_value) = $protocolprop_top_key_h->fetchrow_array()) {
635 $selected_protocol_top_key_info{$protocol_id}->{'markers_array'} = decode_json $markers_value;
640 my @genotypeprop_hash_select_arr;
641 foreach (@$genotypeprop_hash_select){
642 push @genotypeprop_hash_select_arr, "s.value->>'$_'";
644 if (scalar(@genotype_id_array)>0) {
645 my $genotypeprop_id_sql = join ("," , @genotype_id_array);
646 my $genotypeprop_hash_select_sql = scalar(@genotypeprop_hash_select_arr) > 0 ? ', '.join ',', @genotypeprop_hash_select_arr : '';
648 my $filtered_markers_sql = '';
649 if (scalar(keys %filtered_markers) >0 && scalar(keys %filtered_markers) < 10000) {
650 $filtered_markers_sql = " AND s.key IN ('". join ("','", keys %filtered_markers) ."')";
653 my $q2 = "SELECT genotypeprop_id
654 FROM genotypeprop WHERE genotype_id = ? AND type_id=$vcf_snp_genotyping_cvterm_id $genotypeprop_chromosome_rank_string;";
655 my $h2 = $schema->storage->dbh()->prepare($q2);
657 my $genotypeprop_q = "SELECT s.key $genotypeprop_hash_select_sql
658 FROM genotypeprop, jsonb_each(genotypeprop.value) as s
659 WHERE genotypeprop_id = ? AND s.key != 'CHROM' AND type_id = $vcf_snp_genotyping_cvterm_id $filtered_markers_sql;";
660 my $genotypeprop_h = $schema->storage->dbh()->prepare($genotypeprop_q);
662 foreach my $genotype_id (@genotype_id_array){
663 $h2->execute($genotype_id);
664 while (my ($genotypeprop_id) = $h2->fetchrow_array()) {
665 $genotypeprop_h->execute($genotypeprop_id);
666 while (my ($marker_name, @genotypeprop_info_return) = $genotypeprop_h->fetchrow_array()) {
667 for my $s (0 .. scalar(@genotypeprop_hash_select_arr)-1){
668 $genotype_hash{$genotype_id}->{selected_genotype_hash}->{$marker_name}->{$genotypeprop_hash_select->[$s]} = $genotypeprop_info_return[$s];
675 foreach (@genotype_id_array) {
676 my $info = $genotype_hash{$_};
677 my $selected_marker_info = $selected_protocol_marker_info{$info->{analysisMethodDbId}} ? $selected_protocol_marker_info{$info->{analysisMethodDbId}} : {};
678 my $selected_protocol_info = $selected_protocol_top_key_info{$info->{analysisMethodDbId}} ? $selected_protocol_top_key_info{$info->{analysisMethodDbId}} : {};
679 my @all_protocol_marker_names = keys %$selected_marker_info;
680 $selected_protocol_info->{markers} = $selected_marker_info;
681 $info->{resultCount} = scalar(keys %{$info->{selected_genotype_hash}});
682 $info->{all_protocol_marker_names} = \@all_protocol_marker_names;
683 $info->{selected_protocol_hash} = $selected_protocol_info;
684 push @data, $info;
687 #print STDERR Dumper \@data;
688 return ($total_count, \@data);
691 =head2 init_genotype_iterator()
693 Function for initiating genotype search query and then used to get genotype data iteratively. Iterative search retrieval minimizes memory usage.
694 Should be used like:
696 my $genotype_search = CXGN::Genotype::Search->new({
697 bcs_schema=>$schema,
698 etc...
700 $genotype_search->init_genotype_iterator();
701 while (my ($count, $genotype_data) = $genotype_search->get_next_genotype_info) {
702 #Do something with genotype data
705 If you just want to get a file with the genotype result in a dosage matrix or VCF file, use get_cached_file_dosage_matrix or get_cached_file_VCF functions instead.
706 If you want results in json format use get_cached_file_search_json
707 If you want results in json format for only the metadata (no genotype call data), use get_cached_file_search_json()
709 =cut
711 sub init_genotype_iterator {
712 my $self = shift;
713 my $schema = $self->bcs_schema;
714 my $trial_list = $self->trial_list;
715 my $genotype_data_project_list = $self->genotype_data_project_list;
716 my $protocol_id_list = $self->protocol_id_list;
717 my $markerprofile_id_list = $self->markerprofile_id_list;
718 my $accession_list = $self->accession_list;
719 my $tissue_sample_list = $self->tissue_sample_list;
720 my $marker_name_list = $self->marker_name_list;
721 my $chromosome_list = $self->chromosome_list;
722 my $start_position = $self->start_position;
723 my $end_position = $self->end_position;
724 my $genotypeprop_hash_select = $self->genotypeprop_hash_select;
725 my $protocolprop_top_key_select = $self->protocolprop_top_key_select;
726 my $protocolprop_marker_hash_select = $self->protocolprop_marker_hash_select;
727 my $marker_search_hash_list = $self->marker_search_hash_list;
728 my $marker_score_search_hash_list = $self->marker_score_search_hash_list;
729 my $return_only_first_genotypeprop_for_stock = $self->return_only_first_genotypeprop_for_stock;
730 my $limit = $self->limit;
731 my $offset = $self->offset;
732 my @data;
733 my %search_params;
734 my @where_clause;
736 my $snp_genotyping_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'snp genotyping', 'genotype_property')->cvterm_id();
737 $self->_snp_genotyping_cvterm_id($snp_genotyping_cvterm_id);
738 my $vcf_snp_genotyping_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_snp_genotyping', 'genotype_property')->cvterm_id();
739 $self->_vcf_snp_genotyping_cvterm_id($vcf_snp_genotyping_cvterm_id);
740 my $vcf_map_details_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details', 'protocol_property')->cvterm_id();
741 $self->_vcf_map_details_cvterm_id($vcf_map_details_cvterm_id);
742 my $vcf_map_details_markers_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details_markers', 'protocol_property')->cvterm_id();
743 $self->_vcf_map_details_markers_cvterm_id($vcf_map_details_markers_cvterm_id);
744 my $vcf_map_details_markers_array_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'vcf_map_details_markers_array', 'protocol_property')->cvterm_id();
745 $self->_vcf_map_details_markers_array_cvterm_id($vcf_map_details_markers_array_cvterm_id);
746 my $igd_genotypeprop_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'igd number', 'genotype_property')->cvterm_id();
747 $self->_igd_genotypeprop_cvterm_id($igd_genotypeprop_cvterm_id);
748 my $accession_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'accession', 'stock_type')->cvterm_id();
749 $self->_accession_cvterm_id($accession_cvterm_id);
750 my $tissue_sample_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'tissue_sample', 'stock_type')->cvterm_id();
751 $self->_tissue_sample_cvterm_id($tissue_sample_cvterm_id);
752 my $plot_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'plot', 'stock_type')->cvterm_id();
753 $self->_plot_cvterm_id($plot_cvterm_id);
754 my $plant_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'plant', 'stock_type')->cvterm_id();
755 $self->_plant_cvterm_id($plant_cvterm_id);
756 my $tissue_sample_of_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($self->bcs_schema, 'tissue_sample_of', 'stock_relationship')->cvterm_id();
757 $self->_tissue_sample_of_cvterm_id($tissue_sample_of_cvterm_id);
759 my @trials_accessions;
760 foreach (@$trial_list){
761 my $trial = CXGN::Trial->new({bcs_schema=>$schema, trial_id=>$_});
762 my $accessions = $trial->get_accessions();
763 foreach (@$accessions){
764 push @trials_accessions, $_->{stock_id};
768 #If accessions are explicitly given, then accessions found from trials will not be added to the search.
769 if (!$accession_list || scalar(@$accession_list)==0) {
770 push @$accession_list, @trials_accessions;
773 #For projects inserted into database during the addition of genotypes and genotypeprops
774 if (scalar(@trials_accessions)==0){
775 if ($trial_list && scalar(@$trial_list)>0) {
776 my $trial_sql = join ("," , @$trial_list);
777 push @where_clause, "project.project_id in ($trial_sql)";
781 #For genotyping_data_project
782 if ($genotype_data_project_list && scalar($genotype_data_project_list)>0) {
783 my $sql = join ("," , @$genotype_data_project_list);
784 push @where_clause, "project.project_id in ($sql)";
786 if ($protocol_id_list && scalar(@$protocol_id_list)>0) {
787 my $protocol_sql = join ("," , @$protocol_id_list);
788 push @where_clause, "nd_protocol.nd_protocol_id in ($protocol_sql)";
790 if ($accession_list && scalar(@$accession_list)>0) {
791 my $accession_sql = join ("," , @$accession_list);
792 push @where_clause, " ( stock.stock_id in ($accession_sql) OR (accession_of_tissue_sample.stock_id in ($accession_sql) AND accession_of_tissue_sample.type_id = $accession_cvterm_id) ) ";
793 push @where_clause, "stock.type_id in ($accession_cvterm_id, $tissue_sample_cvterm_id)";
795 if ($tissue_sample_list && scalar(@$tissue_sample_list)>0) {
796 my $stock_sql = join ("," , @$tissue_sample_list);
797 push @where_clause, "stock.stock_id in ($stock_sql)";
798 push @where_clause, "stock.type_id = $tissue_sample_cvterm_id";
800 if ($markerprofile_id_list && scalar(@$markerprofile_id_list)>0) {
801 my $markerprofile_sql = join ("," , @$markerprofile_id_list);
802 push @where_clause, "genotype.genotype_id in ($markerprofile_sql)";
804 my %filtered_markers;
805 if ($marker_name_list && scalar(@$marker_name_list)>0) {
806 my $search_vals_sql = "'".join ("','" , @$marker_name_list)."'";
807 push @where_clause, "nd_protocolprop.value->'marker_names' \\?& array[$search_vals_sql]";
809 foreach (@$marker_name_list) {
810 $filtered_markers{$_}++;
813 if ($marker_search_hash_list && scalar(@$marker_search_hash_list)>0) {
814 foreach (@$marker_search_hash_list){
815 my $json_val = encode_json $_;
816 push @where_clause, "nd_protocolprop.value->'markers' \\@> $json_val"."::jsonb";
819 if ($marker_score_search_hash_list && scalar(@$marker_score_search_hash_list)>0) {
820 foreach (@$marker_score_search_hash_list){
821 my $json_val = encode_json $_;
822 push @where_clause, "genotype_values.value \\@> $json_val"."::jsonb";
826 my $where_clause = scalar(@where_clause)>0 ? " WHERE " . (join (" AND " , @where_clause)) : '';
828 my $offset_clause = '';
829 my $limit_clause = '';
830 if ($limit){
831 $limit_clause = " LIMIT $limit ";
833 if ($offset){
834 $offset_clause = " OFFSET $offset ";
837 my $stock_select = '';
838 if ($return_only_first_genotypeprop_for_stock) {
839 $stock_select = 'distinct on (stock.stock_id) stock.stock_id';
840 } else {
841 $stock_select = 'stock.stock_id';
844 # Setup protocolprop query handles
845 my @protocolprop_marker_hash_select_arr;
846 foreach (@$protocolprop_marker_hash_select){
847 push @protocolprop_marker_hash_select_arr, "s.value->>'$_'";
849 $self->_protocolprop_marker_hash_select_arr(\@protocolprop_marker_hash_select_arr);
851 my @protocolprop_top_key_select_arr;
852 foreach (@$protocolprop_top_key_select){
853 if ($_ ne 'markers' && $_ ne 'markers_array') {
854 push @protocolprop_top_key_select_arr, "value->>'$_'";
857 $self->_protocolprop_top_key_select_arr(\@protocolprop_top_key_select_arr);
859 my $protocolprop_hash_select_sql = scalar(@protocolprop_marker_hash_select_arr) > 0 ? ', '.join ',', @protocolprop_marker_hash_select_arr : '';
861 my $chromosome_where = '';
862 my $genotypeprop_chromosome_rank_string = '';
863 if ($chromosome_list && scalar(@$chromosome_list)>0) {
864 my $chromosome_list_sql = '\'' . join('\', \'', @$chromosome_list) . '\'';
865 $chromosome_where = " AND (s.value->>'chrom')::text IN ($chromosome_list_sql)";
866 #$genotypeprop_chromosome_rank_string = " AND value->>'CHROM' IN ($chromosome_list_sql) ";
868 my $start_position_where = '';
869 if (defined($start_position)) {
870 $start_position_where = " AND (s.value->>'pos')::int >= $start_position";
872 my $end_position_where = '';
873 if (defined($end_position)) {
874 $end_position_where = " AND (s.value->>'pos')::int <= $end_position";
876 my $marker_name_list_where = '';
877 if ($marker_name_list && scalar(@$marker_name_list)>0) {
878 my $search_vals_sql = '\''.join ('\', \'' , @$marker_name_list).'\'';
879 $marker_name_list_where = "AND (s.value->>'name')::text IN ($search_vals_sql)";
882 my $protocolprop_q = "SELECT nd_protocol_id, s.key $protocolprop_hash_select_sql
883 FROM nd_protocolprop, jsonb_each(nd_protocolprop.value) as s
884 WHERE nd_protocol_id = ? AND type_id = $vcf_map_details_markers_cvterm_id $chromosome_where $start_position_where $end_position_where $marker_name_list_where;";
885 #print STDERR Dumper $protocolprop_q;
886 my $protocolprop_h = $schema->storage->dbh()->prepare($protocolprop_q);
887 $self->_protocolprop_markers_h($protocolprop_h);
889 my $protocolprop_top_key_select_sql = scalar(@protocolprop_top_key_select_arr) > 0 ? ', '.join ',', @protocolprop_top_key_select_arr : '';
890 my $protocolprop_top_key_q = "SELECT nd_protocol_id $protocolprop_top_key_select_sql from nd_protocolprop WHERE nd_protocol_id = ? AND type_id = $vcf_map_details_cvterm_id;";
891 my $protocolprop_top_key_h = $schema->storage->dbh()->prepare($protocolprop_top_key_q);
892 $self->_protocolprop_top_key_h($protocolprop_top_key_h);
894 my $protocolprop_top_key_markers_q = "SELECT nd_protocol_id, value from nd_protocolprop WHERE nd_protocol_id = ? AND type_id = $vcf_map_details_markers_cvterm_id;";
895 my $protocolprop_top_key_markers_h = $schema->storage->dbh()->prepare($protocolprop_top_key_markers_q);
896 $self->_protocolprop_top_key_markers_h($protocolprop_top_key_markers_h);
898 my $protocolprop_top_key_markers_array_q = "SELECT nd_protocol_id, value from nd_protocolprop WHERE nd_protocol_id = ? AND type_id = $vcf_map_details_markers_array_cvterm_id;";
899 my $protocolprop_top_key_markers_array_h = $schema->storage->dbh()->prepare($protocolprop_top_key_markers_array_q);
900 $self->_protocolprop_top_key_markers_array_h($protocolprop_top_key_markers_array_h);
902 my $q = "SELECT $stock_select, igd_number_genotypeprop.value, nd_protocol.nd_protocol_id, nd_protocol.name, stock.uniquename, stock.type_id, stock_cvterm.name, genotype.genotype_id, genotype.uniquename, genotype.description, project.project_id, project.name, project.description, accession_of_tissue_sample.stock_id, accession_of_tissue_sample.uniquename, count(genotype.genotype_id) OVER() AS full_count
903 FROM stock
904 JOIN cvterm AS stock_cvterm ON(stock.type_id = stock_cvterm.cvterm_id)
905 LEFT JOIN stock_relationship ON(stock_relationship.subject_id=stock.stock_id AND stock_relationship.type_id = $tissue_sample_of_cvterm_id)
906 LEFT JOIN stock AS accession_of_tissue_sample ON(stock_relationship.object_id=accession_of_tissue_sample.stock_id)
907 JOIN nd_experiment_stock ON(stock.stock_id=nd_experiment_stock.stock_id)
908 JOIN nd_experiment USING(nd_experiment_id)
909 JOIN nd_experiment_protocol USING(nd_experiment_id)
910 JOIN nd_experiment_project USING(nd_experiment_id)
911 JOIN nd_experiment_genotype USING(nd_experiment_id)
912 JOIN nd_protocol USING(nd_protocol_id)
913 LEFT JOIN nd_protocolprop ON(nd_protocolprop.nd_protocol_id = nd_protocol.nd_protocol_id AND nd_protocolprop.type_id = $vcf_map_details_cvterm_id)
914 JOIN genotype USING(genotype_id)
915 LEFT JOIN genotypeprop AS igd_number_genotypeprop ON(igd_number_genotypeprop.genotype_id = genotype.genotype_id AND igd_number_genotypeprop.type_id = $igd_genotypeprop_cvterm_id)
916 JOIN project USING(project_id)
917 $where_clause
918 ORDER BY stock.stock_id, genotype.genotype_id ASC
919 $limit_clause
920 $offset_clause;";
922 #print STDERR Dumper $q;
923 my $h = $schema->storage->dbh()->prepare($q);
924 $h->execute();
925 my @genotypeprop_infos;
926 my %seen_protocol_ids;
927 while (my ($stock_id, $igd_number_json, $protocol_id, $protocol_name, $stock_name, $stock_type_id, $stock_type_name, $genotype_id, $genotype_uniquename, $genotype_description, $project_id, $project_name, $project_description, $accession_id, $accession_uniquename, $full_count) = $h->fetchrow_array()) {
929 my $germplasmName = '';
930 my $germplasmDbId = '';
932 my $igd_number_hash = $igd_number_json ? decode_json $igd_number_json : undef;
933 my $igd_number = $igd_number_hash ? $igd_number_hash->{'igd number'} : undef;
934 $igd_number = !$igd_number && $igd_number_hash ? $igd_number_hash->{'igd_number'} : undef;
936 if ($stock_type_name eq 'accession'){
937 $germplasmName = $stock_name;
938 $germplasmDbId = $stock_id;
940 if ($stock_type_name eq 'tissue_sample'){
941 $germplasmName = $accession_uniquename;
942 $germplasmDbId = $accession_id;
945 my $stock_object = CXGN::Stock::Accession->new({schema=>$self->bcs_schema, stock_id=>$germplasmDbId});
947 my %genotypeprop_info = (
948 markerProfileDbId => $genotype_id,
949 germplasmDbId => $germplasmDbId,
950 germplasmName => $germplasmName,
951 synonyms => $stock_object->synonyms,
952 stock_id => $stock_id,
953 stock_name => $stock_name,
954 stock_type_id => $stock_type_id,
955 stock_type_name => $stock_type_name,
956 genotypeDbId => $genotype_id,
957 genotypeUniquename => $genotype_uniquename,
958 genotypeDescription => $genotype_description,
959 analysisMethodDbId => $protocol_id,
960 analysisMethod => $protocol_name,
961 genotypingDataProjectDbId => $project_id,
962 genotypingDataProjectName => $project_name,
963 genotypingDataProjectDescription => $project_description,
964 igd_number => $igd_number,
965 full_count => $full_count
967 $seen_protocol_ids{$protocol_id}++;
968 push @genotypeprop_infos, \%genotypeprop_info;
970 $self->_genotypeprop_infos(\@genotypeprop_infos);
971 $self->_genotypeprop_infos_counter(0);
973 my @seen_protocol_ids = keys %seen_protocol_ids;
974 my %protocolprop_top_key_select_hash = map {$_ => 1} @protocolprop_top_key_select_arr;
975 my %selected_protocol_marker_info;
976 my %selected_protocol_top_key_info;
977 my $val;
979 foreach my $protocol_id (@seen_protocol_ids){
980 $protocolprop_h->execute($protocol_id);
981 while (my ($protocol_id, $marker_name, @protocolprop_info_return) = $protocolprop_h->fetchrow_array()) {
982 for my $s (0 .. scalar(@protocolprop_marker_hash_select_arr)-1){
983 $selected_protocol_marker_info{$protocol_id}->{$marker_name}->{$protocolprop_marker_hash_select->[$s]} = $protocolprop_info_return[$s];
985 $filtered_markers{$marker_name}++;
988 $protocolprop_top_key_h->execute($protocol_id);
989 while (my ($protocol_id, @protocolprop_top_key_return) = $protocolprop_top_key_h->fetchrow_array()) {
990 for my $s (0 .. scalar(@protocolprop_top_key_select_arr)-1){
991 my $protocolprop_i = $protocolprop_top_key_select->[$s];
992 if ($protocolprop_i eq 'header_information_lines' || $protocolprop_i eq 'marker_names') {
993 $val = decode_json $protocolprop_top_key_return[$s];
994 } else {
995 $val = $protocolprop_top_key_return[$s];
997 $selected_protocol_top_key_info{$protocol_id}->{$protocolprop_i} = $val;
1000 if (exists($protocolprop_top_key_select_hash{'markers'})) {
1001 $protocolprop_top_key_markers_h->execute($protocol_id);
1002 while (my ($protocol_id, $markers_value) = $protocolprop_top_key_markers_h->fetchrow_array()) {
1003 $selected_protocol_top_key_info{$protocol_id}->{'markers'} = decode_json $markers_value;
1006 if (exists($protocolprop_top_key_select_hash{'markers_array'})) {
1007 $protocolprop_top_key_markers_array_h->execute($protocol_id);
1008 while (my ($protocol_id, $markers_value) = $protocolprop_top_key_markers_array_h->fetchrow_array()) {
1009 $selected_protocol_top_key_info{$protocol_id}->{'markers_array'} = decode_json $markers_value;
1013 $self->_selected_protocol_marker_info(\%selected_protocol_marker_info);
1014 $self->_selected_protocol_top_key_info(\%selected_protocol_top_key_info);
1015 $self->_filtered_markers(\%filtered_markers);
1017 # Setup genotypeprop query handle
1018 my @genotypeprop_hash_select_arr;
1019 foreach (@$genotypeprop_hash_select){
1020 push @genotypeprop_hash_select_arr, "s.value->>'$_'";
1022 $self->_genotypeprop_hash_select_arr(\@genotypeprop_hash_select_arr);
1023 my $genotypeprop_hash_select_sql = scalar(@genotypeprop_hash_select_arr) > 0 ? ', '.join ',', @genotypeprop_hash_select_arr : '';
1025 my $filtered_markers_sql = '';
1026 # If filtered markers by providing a location range or chromosome these markers will be in %filered_markers, but we dont want to use this SQL if there are too many markers (>10000) )
1027 if (scalar(keys %filtered_markers) >0 && scalar(keys %filtered_markers) < 10000) {
1028 $filtered_markers_sql = " AND s.key IN ('". join ("','", keys %filtered_markers) ."')";
1031 my $genotypeprop_q = "SELECT s.key $genotypeprop_hash_select_sql
1032 FROM genotypeprop, jsonb_each(genotypeprop.value) as s
1033 WHERE genotypeprop_id = ? AND s.key != 'CHROM' AND type_id = $vcf_snp_genotyping_cvterm_id $filtered_markers_sql;";
1034 my $genotypeprop_h = $schema->storage->dbh()->prepare($genotypeprop_q);
1035 $self->_genotypeprop_h($genotypeprop_h);
1037 my $q2 = "SELECT genotypeprop_id
1038 FROM genotypeprop WHERE genotype_id = ? AND type_id=$vcf_snp_genotyping_cvterm_id $genotypeprop_chromosome_rank_string;";
1039 my $h2 = $schema->storage->dbh()->prepare($q2);
1040 $self->_iterator_genotypeprop_query_handle($h2);
1042 return;
1045 =head2 get_next_genotype_info()
1047 Function for getting genotype data iteratively. Iterative search retrieval minimizes memory usage.
1048 Should be used like:
1050 my $genotype_search = CXGN::Genotype::Search->new({
1051 bcs_schema=>$schema,
1052 ...etc
1054 $genotype_search->init_genotype_iterator();
1055 while (my ($count, $genotype_data) = $genotype_search->get_next_genotype_info) {
1056 #Do something with genotype data
1059 If you just want to get a file with the genotype result in a dosage matrix or VCF file, use get_cached_file_dosage_matrix or get_cached_file_VCF instead.
1060 If you want results in json format use get_cached_file_search_json
1061 If you want results in json format for only the metadata (no genotype call data), use get_cached_file_search_json()
1063 =cut
1065 sub get_next_genotype_info {
1066 my $self = shift;
1067 my $schema = $self->bcs_schema;
1068 my $trial_list = $self->trial_list;
1069 my $genotype_data_project_list = $self->genotype_data_project_list;
1070 my $protocol_id_list = $self->protocol_id_list;
1071 my $markerprofile_id_list = $self->markerprofile_id_list;
1072 my $accession_list = $self->accession_list;
1073 my $tissue_sample_list = $self->tissue_sample_list;
1074 my $marker_name_list = $self->marker_name_list;
1075 my $chromosome_list = $self->chromosome_list;
1076 my $start_position = $self->start_position;
1077 my $end_position = $self->end_position;
1078 my $genotypeprop_hash_select = $self->genotypeprop_hash_select;
1079 my $protocolprop_top_key_select = $self->protocolprop_top_key_select;
1080 my $protocolprop_marker_hash_select = $self->protocolprop_marker_hash_select;
1081 my $marker_search_hash_list = $self->marker_search_hash_list;
1082 my $marker_score_search_hash_list = $self->marker_score_search_hash_list;
1083 my $return_only_first_genotypeprop_for_stock = $self->return_only_first_genotypeprop_for_stock;
1084 my $h = $self->_iterator_query_handle();
1085 my $h_genotypeprop = $self->_iterator_genotypeprop_query_handle();
1086 my $protocolprop_markers_h = $self->_protocolprop_markers_h();
1087 my $protocolprop_top_key_h = $self->_protocolprop_top_key_h();
1088 my $protocolprop_top_key_markers_h = $self->_protocolprop_top_key_markers_h();
1089 my $genotypeprop_h = $self->_genotypeprop_h();
1090 my $protocolprop_top_key_markers_array_h = $self->_protocolprop_top_key_markers_array_h();
1091 my $protocolprop_marker_hash_select_arr = $self->_protocolprop_marker_hash_select_arr();
1092 my $protocolprop_top_key_select_arr = $self->_protocolprop_top_key_select_arr();
1093 my $genotypeprop_hash_select_arr = $self->_genotypeprop_hash_select_arr();
1094 my $snp_genotyping_cvterm_id = $self->_snp_genotyping_cvterm_id();
1095 my $vcf_snp_genotyping_cvterm_id = $self->_vcf_snp_genotyping_cvterm_id();
1096 my $vcf_map_details_cvterm_id = $self->_vcf_map_details_cvterm_id();
1097 my $vcf_map_details_markers_cvterm_id = $self->_vcf_map_details_markers_cvterm_id();
1098 my $vcf_map_details_markers_array_cvterm_id = $self->_vcf_map_details_markers_array_cvterm_id();
1099 my $igd_genotypeprop_cvterm_id = $self->_igd_genotypeprop_cvterm_id();
1100 my $accession_cvterm_id = $self->_accession_cvterm_id();
1101 my $tissue_sample_cvterm_id = $self->_tissue_sample_cvterm_id();
1102 my $tissue_sample_of_cvterm_id = $self->_tissue_sample_of_cvterm_id();
1105 my $total_count = 0;
1106 my @genotypeprop_array;
1107 my %protocolprop_hash;
1109 my %selected_protocol_marker_info = %{$self->_selected_protocol_marker_info};
1110 my %selected_protocol_top_key_info = %{$self->_selected_protocol_top_key_info};
1111 my %filtered_markers = %{$self->_filtered_markers};
1112 my $genotypeprop_infos = $self->_genotypeprop_infos;
1113 my $genotypeprop_infos_counter = $self->_genotypeprop_infos_counter;
1115 if ($genotypeprop_infos->[$genotypeprop_infos_counter]) {
1116 my %genotypeprop_info = %{$genotypeprop_infos->[$genotypeprop_infos_counter]};
1117 my $genotype_id = $genotypeprop_info{markerProfileDbId};
1118 my $protocol_id = $genotypeprop_info{analysisMethodDbId};
1119 my $full_count = $genotypeprop_info{full_count};
1121 $h_genotypeprop->execute($genotype_id);
1122 while (my ($genotypeprop_id) = $h_genotypeprop->fetchrow_array) {
1123 $genotypeprop_h->execute($genotypeprop_id);
1124 while (my ($marker_name, @genotypeprop_info_return) = $genotypeprop_h->fetchrow_array()) {
1125 for my $s (0 .. scalar(@$genotypeprop_hash_select_arr)-1){
1126 $genotypeprop_info{selected_genotype_hash}->{$marker_name}->{$genotypeprop_hash_select->[$s]} = $genotypeprop_info_return[$s];
1131 my $selected_marker_info = $selected_protocol_marker_info{$protocol_id} ? $selected_protocol_marker_info{$protocol_id} : {};
1132 my $selected_protocol_info = $selected_protocol_top_key_info{$protocol_id} ? $selected_protocol_top_key_info{$protocol_id} : {};
1133 my @all_protocol_marker_names = keys %$selected_marker_info;
1134 $selected_protocol_info->{markers} = $selected_marker_info;
1135 $genotypeprop_info{resultCount} = scalar(keys %{$genotypeprop_info{selected_genotype_hash}});
1136 $genotypeprop_info{all_protocol_marker_names} = \@all_protocol_marker_names;
1137 $genotypeprop_info{selected_protocol_hash} = $selected_protocol_info;
1139 $self->_genotypeprop_infos_counter($self->_genotypeprop_infos_counter + 1);
1141 return ($full_count, \%genotypeprop_info);
1144 return;
1147 sub key {
1148 my $self = shift;
1149 my $datatype = shift;
1151 my $json = JSON->new();
1152 #preserve order of hash keys to get same text
1153 $json = $json->canonical();
1154 my $accessions = $json->encode( $self->accession_list() || [] );
1155 my $tissues = $json->encode( $self->tissue_sample_list() || [] );
1156 my $trials = $json->encode( $self->trial_list() || [] );
1157 my $protocols = $json->encode( $self->protocol_id_list() || [] );
1158 my $markerprofiles = $json->encode( $self->markerprofile_id_list() || [] );
1159 my $genotypedataprojects = $json->encode( $self->genotype_data_project_list() || [] );
1160 my $markernames = $json->encode( $self->marker_name_list() || [] );
1161 my $genotypeprophash = $json->encode( $self->genotypeprop_hash_select() || [] );
1162 my $protocolprophash = $json->encode( $self->protocolprop_top_key_select() || [] );
1163 my $protocolpropmarkerhash = $json->encode( $self->protocolprop_marker_hash_select() || [] );
1164 my $chromosomes = $json->encode( $self->chromosome_list() || [] );
1165 my $start = $self->start_position() || '' ;
1166 my $end = $self->end_position() || '';
1167 my $prevent_transpose = $self->prevent_transpose() || '';
1168 my $key = md5_hex($accessions.$tissues.$trials.$protocols.$markerprofiles.$genotypedataprojects.$markernames.$genotypeprophash.$protocolprophash.$protocolpropmarkerhash.$chromosomes.$start.$end.$self->return_only_first_genotypeprop_for_stock().$prevent_transpose.$self->limit().$self->offset()."_$datatype");
1169 #print STDERR Dumper($genotypeprophash);
1170 return $key;
1173 =head2 get_cached_file_search_json()
1175 Function for getting the file handle for the genotype search result from cache. Will write the cached file if it does not exist.
1176 Returns the genotype result in a line-by-line json format.
1177 Uses the file iterator to write the cached file, so that it uses little memory.
1179 First line in file has all marker objects, while subsequent lines have markerprofiles for each sample
1180 If you want results in json format for only the metadata (no genotype call data), pass 1 for metadata_only
1182 =cut
1184 sub get_cached_file_search_json {
1185 my $self = shift;
1186 my $shared_cluster_dir_config = shift;
1187 my $metadata_only = shift;
1188 my $protocol_ids = $self->protocol_id_list;
1190 my $metadata_only_string = $metadata_only ? "metadata_only" : "all_data";
1191 my $key = $self->key("get_cached_file_search_json_v03_".$metadata_only_string);
1192 $self->cache( Cache::File->new( cache_root => $self->cache_root() ));
1194 my $file_handle;
1195 if ($self->cache()->exists($key) && !$self->forbid_cache()) {
1196 $file_handle = $self->cache()->handle($key);
1197 } else {
1198 # Set the temp dir and temp output file
1199 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_json";
1200 print STDERR "creating cached json file in $tmp_output_dir\n";
1201 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
1202 my ($tmp_fh, $tempfile) = tempfile(
1203 "wizard_download_XXXXX",
1204 DIR=> $tmp_output_dir,
1207 my @all_marker_objects;
1208 foreach (@$protocol_ids) {
1209 my $protocol = CXGN::Genotype::Protocol->new({
1210 bcs_schema => $self->bcs_schema,
1211 nd_protocol_id => $_,
1212 chromosome_list=>$self->chromosome_list,
1213 start_position=>$self->start_position,
1214 end_position=>$self->end_position,
1215 marker_name_list=>$self->marker_name_list
1217 my $markers = $protocol->markers;
1218 push @all_marker_objects, values %$markers;
1221 foreach (@all_marker_objects) {
1222 $self->_filtered_markers()->{$_->{name}}++;
1225 $self->init_genotype_iterator();
1227 #VCF should be sorted by chromosome and position
1228 no warnings 'uninitialized';
1229 @all_marker_objects = sort { $a->{chrom} cmp $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
1230 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1232 my $counter = 0;
1233 while (my $geno = $self->get_next_genotype_info) {
1235 # OLD GENOTYPING PROTCOLS DID NOT HAVE ND_PROTOCOLPROP INFO...
1236 if (scalar(@all_marker_objects) == 0) {
1237 foreach my $o (sort genosort keys %{$geno->{selected_genotype_hash}}) {
1238 push @all_marker_objects, {name => $o};
1240 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1243 if ($metadata_only) {
1244 @all_marker_objects = [];
1245 delete $geno->{selected_genotype_hash};
1246 delete $geno->{selected_protocol_hash};
1247 delete $geno->{all_protocol_marker_names};
1250 my $genotype_string = encode_json $geno;
1251 $genotype_string .= "\n";
1252 if ($counter == 0) {
1253 my $marker_string = encode_json \@all_marker_objects;
1254 $marker_string .= "\n";
1255 write_file($tempfile, {append => 1}, $marker_string);
1257 write_file($tempfile, {append => 1}, $genotype_string);
1258 $counter++;
1260 close $tempfile;
1262 open my $out_copy, '<', $tempfile or die "Can't open output file: $!";
1264 $self->cache()->set($key, '');
1265 $file_handle = $self->cache()->handle($key);
1266 copy($out_copy, $file_handle);
1268 close $out_copy;
1269 $file_handle = $self->cache()->handle($key);
1271 return $file_handle;
1274 =head2 get_cached_file_dosage_matrix()
1276 Function for getting the file handle for the genotype search result from cache. Will write the cached file if it does not exist.
1277 Returns the genotype result as a dosage matrix format.
1278 Uses the file iterator to write the cached file, so that it uses little memory.
1280 =cut
1282 sub get_cached_file_dosage_matrix {
1283 my $self = shift;
1284 my $shared_cluster_dir_config = shift;
1285 my $backend_config = shift;
1286 my $cluster_host_config = shift;
1287 my $web_cluster_queue_config = shift;
1288 my $basepath_config = shift;
1289 my $protocol_ids = $self->protocol_id_list;
1291 my $key = $self->key("get_cached_file_dosage_matrix_v03");
1292 $self->cache( Cache::File->new( cache_root => $self->cache_root() ));
1294 my $file_handle;
1295 if ($self->cache()->exists($key) && !$self->forbid_cache()) {
1296 $file_handle = $self->cache()->handle($key);
1298 else {
1299 # Set the temp dir and temp output file
1300 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_dosage_matrix";
1301 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
1302 my ($tmp_fh, $tempfile) = tempfile(
1303 "wizard_download_XXXXX",
1304 DIR=> $tmp_output_dir,
1307 my @all_marker_objects;
1308 foreach (@$protocol_ids) {
1309 my $protocol = CXGN::Genotype::Protocol->new({
1310 bcs_schema => $self->bcs_schema,
1311 nd_protocol_id => $_,
1312 chromosome_list=>$self->chromosome_list,
1313 start_position=>$self->start_position,
1314 end_position=>$self->end_position,
1315 marker_name_list=>$self->marker_name_list
1317 my $markers = $protocol->markers;
1318 push @all_marker_objects, values %$markers;
1321 foreach (@all_marker_objects) {
1322 $self->_filtered_markers()->{$_->{name}}++;
1324 $self->init_genotype_iterator();
1326 #VCF should be sorted by chromosome and position
1327 no warnings 'uninitialized';
1328 @all_marker_objects = sort { $a->{chrom} cmp $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
1329 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1331 my $counter = 0;
1332 while (my $geno = $self->get_next_genotype_info) {
1334 # OLD GENOTYPING PROTCOLS DID NOT HAVE ND_PROTOCOLPROP INFO...
1335 if (scalar(@all_marker_objects) == 0) {
1336 foreach my $o (sort genosort keys %{$geno->{selected_genotype_hash}}) {
1337 push @all_marker_objects, {name => $o};
1339 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1342 my $genotype_string = "";
1343 if ($counter == 0) {
1344 $genotype_string .= "Marker\t";
1345 foreach my $m (@all_marker_objects) {
1346 $genotype_string .= $m->{name} . "\t";
1348 $genotype_string .= "\n";
1350 my $genotype_id = $geno->{germplasmName};
1351 if (!$self->return_only_first_genotypeprop_for_stock) {
1352 $genotype_id = $geno->{germplasmName}."|".$geno->{markerProfileDbId};
1354 my $genotype_data_string = "";
1355 foreach my $m (@all_marker_objects) {
1356 my $current_genotype = $geno->{selected_genotype_hash}->{$m->{name}}->{DS};
1357 $genotype_data_string .= $current_genotype."\t";
1360 $genotype_string .= $genotype_id."\t".$genotype_data_string."\n";
1362 write_file($tempfile, {append => 1}, $genotype_string);
1363 $counter++;
1366 my $transpose_tempfile = $tempfile . "_transpose";
1368 my $cmd = CXGN::Tools::Run->new(
1370 backend => $backend_config,
1371 submit_host => $cluster_host_config,
1372 temp_base => $tmp_output_dir,
1373 queue => $web_cluster_queue_config,
1374 do_cleanup => 0,
1375 out_file => $transpose_tempfile,
1376 # out_file => $transpose_tempfile,
1377 # don't block and wait if the cluster looks full
1378 max_cluster_jobs => 1_000_000_000,
1382 my $out_copy;
1383 if ($self->prevent_transpose()) {
1384 open $out_copy, '<', $tempfile or die "Can't open output file: $!";
1386 else {
1387 # Do the transposition job on the cluster
1388 $cmd->run_cluster(
1389 "perl ",
1390 $basepath_config."/bin/transpose_matrix.pl",
1391 $tempfile,
1393 $cmd->is_cluster(1);
1394 $cmd->wait;
1396 open $out_copy, '<', $transpose_tempfile or die "Can't open output file: $!";
1399 $self->cache()->set($key, '');
1400 $file_handle = $self->cache()->handle($key);
1401 copy($out_copy, $file_handle);
1403 close $out_copy;
1404 $file_handle = $self->cache()->handle($key);
1406 return $file_handle;
1409 =head2 get_cached_file_dosage_matrix_compute_from_parents()
1410 Computes the genotypes for the queried accessions computed from the parental dosages. Parents are known from pedigrees of accessions.
1411 Function for getting the file handle for the genotype search result from cache. Will write the cached file if it does not exist.
1412 Returns the genotype result as a dosage matrix format.
1413 Uses the file iterator to write the cached file, so that it uses little memory.
1414 =cut
1416 sub get_cached_file_dosage_matrix_compute_from_parents {
1417 my $self = shift;
1418 my $shared_cluster_dir_config = shift;
1419 my $backend_config = shift;
1420 my $cluster_host_config = shift;
1421 my $web_cluster_queue_config = shift;
1422 my $basepath_config = shift;
1423 my $schema = $self->bcs_schema;
1424 my $protocol_ids = $self->protocol_id_list;
1425 my $marker_name_list = $self->marker_name_list;
1426 my $accession_ids = $self->accession_list;
1427 my $cache_root_dir = $self->cache_root();
1429 if (scalar(@$protocol_ids)>1) {
1430 die "Only one protocol at a time can be done when computing genotypes from parents\n";
1432 my $protocol_id = $protocol_ids->[0];
1434 my $key = $self->key("get_cached_file_dosage_matrix_compute_from_parents_v03");
1435 $self->cache( Cache::File->new( cache_root => $cache_root_dir ));
1437 my $file_handle;
1438 if ($self->cache()->exists($key) && !$self->forbid_cache()) {
1439 $file_handle = $self->cache()->handle($key);
1441 else {
1442 # Set the temp dir and temp output file
1443 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_dosage_matrix_compute_from_parents";
1444 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
1445 my ($tmp_fh, $tempfile) = tempfile(
1446 "wizard_download_XXXXX",
1447 DIR=> $tmp_output_dir,
1450 my $accession_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
1451 my $female_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'female_parent', 'stock_relationship')->cvterm_id();
1452 my $male_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'male_parent', 'stock_relationship')->cvterm_id();
1454 my $accession_list_string = join ',', @$accession_ids;
1455 my $q = "SELECT accession.stock_id, female_parent.stock_id, male_parent.stock_id
1456 FROM stock AS accession
1457 JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
1458 JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
1459 JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
1460 JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
1461 WHERE accession.stock_id IN ($accession_list_string) AND accession.type_id=$accession_cvterm_id ORDER BY accession.stock_id;";
1462 my $h = $schema->storage->dbh()->prepare($q);
1463 $h->execute();
1464 my @accession_stock_ids_found = ();
1465 my @female_stock_ids_found = ();
1466 my @male_stock_ids_found = ();
1467 while (my ($accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
1468 push @accession_stock_ids_found, $accession_stock_id;
1469 push @female_stock_ids_found, $female_parent_stock_id;
1470 push @male_stock_ids_found, $male_parent_stock_id;
1473 # print STDERR Dumper \@accession_stock_ids_found;
1474 # print STDERR Dumper \@female_stock_ids_found;
1475 # print STDERR Dumper \@male_stock_ids_found;
1477 my %unique_germplasm;
1478 my $protocol = CXGN::Genotype::Protocol->new({
1479 bcs_schema => $schema,
1480 nd_protocol_id => $protocol_id,
1481 chromosome_list=>$self->chromosome_list,
1482 start_position=>$self->start_position,
1483 end_position=>$self->end_position,
1484 marker_name_list=>$self->marker_name_list
1486 my $markers = $protocol->markers;
1487 my @all_marker_objects = values %$markers;
1489 foreach (@all_marker_objects) {
1490 $self->_filtered_markers()->{$_->{name}}++;
1493 no warnings 'uninitialized';
1494 @all_marker_objects = sort { $a->{chrom} cmp $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
1495 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1497 my $counter = 0;
1498 for my $i (0..scalar(@accession_stock_ids_found)-1) {
1499 my $female_stock_id = $female_stock_ids_found[$i];
1500 my $male_stock_id = $male_stock_ids_found[$i];
1501 my $accession_stock_id = $accession_stock_ids_found[$i];
1503 my $dataset = CXGN::Dataset::Cache->new({
1504 people_schema=>$self->people_schema,
1505 schema=>$schema,
1506 cache_root=>$cache_root_dir,
1507 accessions=>[$female_stock_id, $male_stock_id]
1509 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name'], 1, $self->chromosome_list, $self->start_position, $self->end_position, $self->marker_name_list);
1511 # For old protocols with no protocolprop info...
1512 if (scalar(@all_marker_objects) == 0) {
1513 foreach my $o (sort genosort keys %{$genotypes->[0]->{selected_genotype_hash}}) {
1514 push @all_marker_objects, {name => $o};
1516 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1519 my $genotype_string = "";
1520 if ($counter == 0) {
1521 $genotype_string .= "Marker\t";
1522 foreach my $m (@all_marker_objects) {
1523 $genotype_string .= $m->{name} . "\t";
1525 $genotype_string .= "\n";
1528 my $geno = CXGN::Genotype::ComputeHybridGenotype->new({
1529 parental_genotypes=>$genotypes,
1530 marker_objects=>\@all_marker_objects
1532 my $progeny_genotype = $geno->get_hybrid_genotype();
1533 my $genotype_string_scores = join "\t", @$progeny_genotype;
1535 my $genotype_id = $accession_stock_id;
1536 if (!$self->return_only_first_genotypeprop_for_stock) {
1537 $genotype_id = $accession_stock_id."|".$geno->{markerProfileDbId};
1540 $genotype_string .= $genotype_id."\t".$genotype_string_scores."\n";
1541 write_file($tempfile, {append => 1}, $genotype_string);
1542 $counter++;
1545 my $transpose_tempfile = $tempfile . "_transpose";
1547 my $cmd = CXGN::Tools::Run->new(
1549 backend => $backend_config,
1550 submit_host => $cluster_host_config,
1551 temp_base => $tmp_output_dir,
1552 queue => $web_cluster_queue_config,
1553 do_cleanup => 0,
1554 out_file => $transpose_tempfile,
1555 # out_file => $transpose_tempfile,
1556 # don't block and wait if the cluster looks full
1557 max_cluster_jobs => 1_000_000_000,
1561 my $out_copy;
1562 if ($self->prevent_transpose()) {
1563 open $out_copy, '<', $tempfile or die "Can't open output file: $!";
1565 else {
1566 # Do the transposition job on the cluster
1567 $cmd->run_cluster(
1568 "perl ",
1569 $basepath_config."/bin/transpose_matrix.pl",
1570 $tempfile,
1572 $cmd->is_cluster(1);
1573 $cmd->wait;
1575 open $out_copy, '<', $transpose_tempfile or die "Can't open output file: $!";
1578 $self->cache()->set($key, '');
1579 $file_handle = $self->cache()->handle($key);
1580 copy($out_copy, $file_handle);
1582 close $out_copy;
1583 $file_handle = $self->cache()->handle($key);
1585 return $file_handle;
1588 sub get_cached_file_VCF {
1589 my $self = shift;
1590 my $shared_cluster_dir_config = shift;
1591 my $backend_config = shift;
1592 my $cluster_host_config = shift;
1593 my $web_cluster_queue_config = shift;
1594 my $basepath_config = shift;
1596 my $key = $self->key("get_cached_file_VCF_v04");
1597 $self->cache( Cache::File->new( cache_root => $self->cache_root() ));
1598 my $protocol_ids = $self->protocol_id_list;
1600 my $file_handle;
1601 if ($self->cache()->exists($key) && !$self->forbid_cache()) {
1602 $file_handle = $self->cache()->handle($key);
1603 } else {
1604 # Set the temp dir and temp output file
1605 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_VCF";
1606 print STDERR "creating cached VCF file in $tmp_output_dir\n";
1607 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
1608 my ($tmp_fh, $tempfile) = tempfile(
1609 "wizard_download_XXXXX",
1610 DIR=> $tmp_output_dir,
1613 my $time = DateTime->now();
1614 my $timestamp = $time->ymd()."_".$time->hms();
1616 my @all_protocol_info_lines;
1618 #Get all marker information for the protocol(s) requested. this is important if they are requesting subsets of markers or if they are querying more than one protocol at once. Also important for ordering VCF output. Old genotypes did not have protocolprop marker info so markers are taken from first genotypeprop return below.
1619 my @all_marker_objects;
1620 my %unique_germplasm;
1621 foreach (@$protocol_ids) {
1622 my $protocol = CXGN::Genotype::Protocol->new({
1623 bcs_schema => $self->bcs_schema,
1624 nd_protocol_id => $_,
1625 chromosome_list=>$self->chromosome_list,
1626 start_position=>$self->start_position,
1627 end_position=>$self->end_position,
1628 marker_name_list=>$self->marker_name_list
1630 my $markers = $protocol->markers;
1631 push @all_protocol_info_lines, @{$protocol->header_information_lines};
1632 push @all_marker_objects, values %$markers;
1634 push @all_protocol_info_lines, "##source=FILE GENERATED BY BREEDBASE";
1635 push @all_protocol_info_lines, "##fileDate=$timestamp";
1637 foreach (@all_marker_objects) {
1638 $self->_filtered_markers()->{$_->{name}}++;
1641 $self->init_genotype_iterator();
1643 #VCF should be sorted by chromosome and position
1644 no warnings 'uninitialized';
1645 @all_marker_objects = sort { $a->{chrom} cmp $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
1646 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1648 my $counter = 0;
1649 while (my $geno = $self->get_next_genotype_info) {
1651 # OLD GENOTYPING PROTCOLS DID NOT HAVE ND_PROTOCOLPROP INFO...
1652 if (scalar(@all_marker_objects) == 0) {
1653 foreach my $o (sort genosort keys %{$geno->{selected_genotype_hash}}) {
1654 push @all_marker_objects, {name => $o};
1656 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1659 $unique_germplasm{$geno->{germplasmDbId}}++;
1661 #print STDERR "GENO = ".Dumper($geno);
1663 my $genotype_string = "";
1664 if ($counter == 0) {
1665 $genotype_string .= "#CHROM\t";
1666 foreach my $m (@all_marker_objects) {
1667 my $chrom = $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom};
1669 if (! $chrom) {
1670 ($chrom) = split /\_/, $m->{name};
1671 #print STDERR "Warning! No chrom data, using $chrom extracted from $m->{name}\n";
1673 #$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom} . "\t";
1674 $genotype_string .= $chrom ."\t";
1676 $genotype_string .= "\n";
1677 $genotype_string .= "POS\t";
1678 foreach my $m (@all_marker_objects) {
1679 my $pos = $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos};
1680 if (! $pos) {
1681 (undef, $pos) = split /\_/, $m->{name};
1682 #print STDERR "Warning! No position data, using $pos extracted from $m->{name}\n";
1684 #$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos} . "\t";
1685 $genotype_string .= $pos ."\t";
1687 $genotype_string .= "\n";
1688 $genotype_string .= "ID\t";
1689 foreach my $m (@all_marker_objects) {
1690 $genotype_string .= $m->{name} . "\t";
1692 $genotype_string .= "\n";
1693 $genotype_string .= "REF\t";
1694 foreach my $m (@all_marker_objects) {
1695 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{ref} . "\t";
1697 $genotype_string .= "\n";
1698 $genotype_string .= "ALT\t";
1699 foreach my $m (@all_marker_objects) {
1700 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{alt} . "\t";
1702 $genotype_string .= "\n";
1703 $genotype_string .= "QUAL\t";
1704 foreach my $m (@all_marker_objects) {
1705 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{qual} . "\t";
1707 $genotype_string .= "\n";
1708 $genotype_string .= "FILTER\t";
1709 foreach my $m (@all_marker_objects) {
1710 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{filter} . "\t";
1712 $genotype_string .= "\n";
1713 $genotype_string .= "INFO\t";
1714 foreach my $m (@all_marker_objects) {
1715 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{info} . "\t";
1717 $genotype_string .= "\n";
1718 $genotype_string .= "FORMAT\t";
1719 foreach my $m (@all_marker_objects) {
1720 my $format = $m->{format};
1721 my @format_array;
1722 #In case of old genotyping protocols where there was no protocolprop marker info
1723 if (!$format) {
1724 my $first_g = $geno->{selected_genotype_hash}->{$m->{name}};
1725 if (defined($first_g->{'GT'})) {
1726 push @format_array, 'GT';
1728 foreach my $k (sort keys %$first_g) {
1729 if ($k eq 'GT') {
1730 } elsif (defined($first_g->{$k})) {
1731 push @format_array, $k;
1734 } else {
1735 @format_array = split ':', $format;
1738 if (scalar(@format_array) > 1) { #ONLY ADD NT FOR NOT OLD GENOTYPING PROTOCOLS
1739 my %format_check = map {$_ => 1} @format_array;
1740 if (!exists($format_check{'NT'})) {
1741 push @format_array, 'NT';
1743 if (!exists($format_check{'DS'})) {
1744 push @format_array, 'DS';
1747 $format = join ':', @format_array;
1748 $genotype_string .= $format . "\t";
1749 $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{format} = $format;
1750 $m->{format} = $format;
1752 $genotype_string .= "\n";
1754 my $genotype_id = $geno->{germplasmName};
1755 if (!$self->return_only_first_genotypeprop_for_stock) {
1756 $genotype_id = $geno->{germplasmName}."|".$geno->{markerProfileDbId};
1758 my $genotype_data_string = "";
1759 foreach my $m (@all_marker_objects) {
1760 my @current_geno = ();
1761 my $name = $m->{name};
1762 my $format = $m->{format};
1763 my @format;
1764 my $gt;
1766 #In case of old genotyping protocols where there was no protocolprop marker info
1767 if (!$format) {
1768 my $first_g = $geno->{selected_genotype_hash}->{$name};
1769 foreach my $k (sort keys %$first_g) {
1770 if (defined($first_g->{$k})) {
1771 push @format, $k;
1774 } else {
1775 @format = split ':', $format;
1778 #VCF requires the GT field to be first
1779 if (defined($geno->{selected_genotype_hash}->{$m->{name}}->{'GT'})) {
1780 my $val = $geno->{selected_genotype_hash}->{$m->{name}}->{'GT'};
1781 if ($val eq '') {
1782 $val = './.';
1784 push @current_geno, $val;
1785 } else {
1786 push @current_geno, './.';
1788 foreach my $format_key (@format) {
1789 my $val = $geno->{selected_genotype_hash}->{$m->{name}}->{$format_key};
1790 if ($format_key eq 'GT') {
1791 } else {
1792 push @current_geno, $val;
1795 my $current_g = join ':', @current_geno;
1796 $genotype_data_string .= $current_g."\t";
1798 $genotype_string .= $genotype_id."\t".$genotype_data_string."\n";
1800 write_file($tempfile, {append => 1}, $genotype_string);
1801 $counter++;
1804 my $transpose_tempfile = $tempfile . "_transpose";
1806 my $cmd = CXGN::Tools::Run->new(
1808 backend => $backend_config,
1809 submit_host => $cluster_host_config,
1810 temp_base => $tmp_output_dir,
1811 queue => $web_cluster_queue_config,
1812 do_cleanup => 0,
1813 out_file => $transpose_tempfile,
1814 # out_file => $transpose_tempfile,
1815 # don't block and wait if the cluster looks full
1816 max_cluster_jobs => 1_000_000_000,
1820 # Do the transposition job on the cluster
1821 $cmd->run_cluster(
1822 "perl ",
1823 $basepath_config."/bin/transpose_matrix.pl",
1824 $tempfile,
1826 $cmd->is_cluster(1);
1827 $cmd->wait;
1829 my $transpose_tempfile_hdr = $tempfile . "_transpose_hdr";
1831 open my $in, '<', $transpose_tempfile or die "Can't read input file: $!";
1832 open my $out, '>', $transpose_tempfile_hdr or die "Can't write output file: $!";
1834 #Get synonyms of the accessions
1835 my $stocklookup = CXGN::Stock::StockLookup->new({schema => $self->bcs_schema});
1836 my @accession_ids = keys %unique_germplasm;
1837 my $synonym_hash = $stocklookup->get_stock_synonyms('stock_id', 'accession', \@accession_ids);
1838 my $synonym_string = "##SynonymsOfAccessions=\"";
1839 while( my( $uniquename, $synonym_list ) = each %{$synonym_hash}){
1840 if(scalar(@{$synonym_list})>0){
1841 if(not length($synonym_string)<1){
1842 $synonym_string.=" ";
1844 $synonym_string.=$uniquename."=(";
1845 $synonym_string.= (join ", ", @{$synonym_list}).")";
1848 $synonym_string .= "\"";
1849 push @all_protocol_info_lines, $synonym_string;
1851 my $vcf_header = join "\n", @all_protocol_info_lines;
1852 $vcf_header .= "\n";
1854 print $out $vcf_header;
1856 while( <$in> )
1858 print $out $_;
1860 close $in;
1861 close $out;
1863 open my $out_copy, '<', $transpose_tempfile_hdr or die "Can't open output file: $!";
1865 $self->cache()->set($key, '');
1866 $file_handle = $self->cache()->handle($key);
1867 copy($out_copy, $file_handle);
1869 close $out_copy;
1870 $file_handle = $self->cache()->handle($key);
1872 return $file_handle;
1875 =head2 get_cached_file_VCF_compute_from_parents()
1876 Computes the genotypes for the queried accessions computed from the parental dosages. Parents are known from pedigrees of accessions.
1877 Function for getting the file handle for the genotype search result from cache. Will write the cached file if it does not exist.
1878 Returns the genotype result as a dosage matrix format.
1879 Uses the file iterator to write the cached file, so that it uses little memory.
1880 =cut
1882 sub get_cached_file_VCF_compute_from_parents {
1883 my $self = shift;
1884 my $shared_cluster_dir_config = shift;
1885 my $backend_config = shift;
1886 my $cluster_host_config = shift;
1887 my $web_cluster_queue_config = shift;
1888 my $basepath_config = shift;
1889 my $schema = $self->bcs_schema;
1890 my $protocol_ids = $self->protocol_id_list;
1891 my $accession_ids = $self->accession_list;
1892 my $cache_root_dir = $self->cache_root();
1893 my $marker_name_list = $self->marker_name_list;
1895 if (scalar(@$protocol_ids)>1) {
1896 die "Only one protocol at a time can be done when computing genotypes from parents\n";
1898 my $protocol_id = $protocol_ids->[0];
1900 my $key = $self->key("get_cached_file_VCF_compute_from_parents_v04");
1901 $self->cache( Cache::File->new( cache_root => $cache_root_dir ));
1903 my $file_handle;
1904 if ($self->cache()->exists($key) && !$self->forbid_cache()) {
1905 $file_handle = $self->cache()->handle($key);
1907 else {
1908 # Set the temp dir and temp output file
1909 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_VCF_compute_from_parents";
1910 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
1911 my ($tmp_fh, $tempfile) = tempfile(
1912 "wizard_download_XXXXX",
1913 DIR=> $tmp_output_dir,
1916 my $accession_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
1917 my $female_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'female_parent', 'stock_relationship')->cvterm_id();
1918 my $male_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'male_parent', 'stock_relationship')->cvterm_id();
1920 my $accession_list_string = join ',', @$accession_ids;
1921 my $q = "SELECT accession.stock_id, female_parent.stock_id, male_parent.stock_id
1922 FROM stock AS accession
1923 JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
1924 JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
1925 JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
1926 JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
1927 WHERE accession.stock_id IN ($accession_list_string) AND accession.type_id=$accession_cvterm_id ORDER BY accession.stock_id;";
1928 my $h = $schema->storage->dbh()->prepare($q);
1929 $h->execute();
1930 my @accession_stock_ids_found = ();
1931 my @female_stock_ids_found = ();
1932 my @male_stock_ids_found = ();
1933 while (my ($accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
1934 push @accession_stock_ids_found, $accession_stock_id;
1935 push @female_stock_ids_found, $female_parent_stock_id;
1936 push @male_stock_ids_found, $male_parent_stock_id;
1939 # print STDERR Dumper \@accession_stock_ids_found;
1940 # print STDERR Dumper \@female_stock_ids_found;
1941 # print STDERR Dumper \@male_stock_ids_found;
1943 my $time = DateTime->now();
1944 my $timestamp = $time->ymd()."_".$time->hms();
1946 my @all_protocol_info_lines;
1948 my %unique_germplasm;
1949 my $protocol = CXGN::Genotype::Protocol->new({
1950 bcs_schema => $schema,
1951 nd_protocol_id => $protocol_id,
1952 chromosome_list=>$self->chromosome_list,
1953 start_position=>$self->start_position,
1954 end_position=>$self->end_position,
1955 marker_name_list=>$self->marker_name_list
1957 my $markers = $protocol->markers;
1958 my @all_marker_objects = values %$markers;
1959 push @all_protocol_info_lines, @{$protocol->header_information_lines};
1960 push @all_protocol_info_lines, "##source=FILE GENERATED BY BREEDBASE";
1961 push @all_protocol_info_lines, "##fileDate=$timestamp";
1963 foreach (@all_marker_objects) {
1964 $self->_filtered_markers()->{$_->{name}}++;
1967 no warnings 'uninitialized';
1968 @all_marker_objects = sort { $a->{chrom} cmp $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
1969 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1971 my $counter = 0;
1972 for my $i (0..scalar(@accession_stock_ids_found)-1) {
1973 my $female_stock_id = $female_stock_ids_found[$i];
1974 my $male_stock_id = $male_stock_ids_found[$i];
1975 my $accession_stock_id = $accession_stock_ids_found[$i];
1977 my $dataset = CXGN::Dataset::Cache->new({
1978 people_schema=>$self->people_schema,
1979 schema=>$schema,
1980 cache_root=>$cache_root_dir,
1981 accessions=>[$female_stock_id, $male_stock_id]
1983 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name', 'chrom', 'pos', 'alt', 'ref'], 1, $self->chromosome_list, $self->start_position, $self->end_position, $self->marker_name_list);
1985 # For old protocols with no protocolprop info...
1986 if (scalar(@all_marker_objects) == 0) {
1987 foreach my $o (sort genosort keys %{$genotypes->[0]->{selected_genotype_hash}}) {
1988 push @all_marker_objects, {name => $o};
1990 @all_marker_objects = $self->_check_filtered_markers(\@all_marker_objects);
1993 if (scalar(@$genotypes)>0) {
1994 my $geno = $genotypes->[0];
1995 $unique_germplasm{$accession_stock_id}++;
1997 my $genotype_string = "";
1998 if ($counter == 0) {
1999 $genotype_string .= "#CHROM\t";
2000 foreach my $m (@all_marker_objects) {
2001 my $chrom = $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom};
2002 if (! $chrom) {
2003 ($chrom) = split /\_/, $m->{name};
2005 $genotype_string .= $chrom . "\t";
2007 $genotype_string .= "\n";
2008 $genotype_string .= "POS\t";
2009 foreach my $m (@all_marker_objects) {
2010 my $pos = $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos};
2011 if (! $pos) {
2012 (undef, $pos) = split /\_/, $m->{name};
2014 $genotype_string .= $pos . "\t";
2016 $genotype_string .= "\n";
2017 $genotype_string .= "ID\t";
2018 foreach my $m (@all_marker_objects) {
2019 $genotype_string .= $m->{name} . "\t";
2021 $genotype_string .= "\n";
2022 $genotype_string .= "REF\t";
2023 foreach my $m (@all_marker_objects) {
2024 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{ref} . "\t";
2026 $genotype_string .= "\n";
2027 $genotype_string .= "ALT\t";
2028 foreach my $m (@all_marker_objects) {
2029 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{alt} . "\t";
2031 $genotype_string .= "\n";
2032 $genotype_string .= "QUAL\t";
2033 foreach my $m (@all_marker_objects) {
2034 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{qual} . "\t";
2036 $genotype_string .= "\n";
2037 $genotype_string .= "FILTER\t";
2038 foreach my $m (@all_marker_objects) {
2039 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{filter} . "\t";
2041 $genotype_string .= "\n";
2042 $genotype_string .= "INFO\t";
2043 foreach my $m (@all_marker_objects) {
2044 $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{info} . "\t";
2046 $genotype_string .= "\n";
2047 $genotype_string .= "FORMAT\t";
2048 foreach my $m (@all_marker_objects) {
2049 my $format = 'DS'; #When calculating genotypes from parents, only will return dosages atleast for now
2050 $genotype_string .= $format . "\t";
2051 $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{format} = $format;
2052 $m->{format} = $format;
2054 $genotype_string .= "\n";
2056 my $genotype_id = $geno->{germplasmName};
2057 if (!$self->return_only_first_genotypeprop_for_stock) {
2058 $genotype_id = $geno->{germplasmName}."|".$geno->{markerProfileDbId};
2061 my $geno_h = CXGN::Genotype::ComputeHybridGenotype->new({
2062 parental_genotypes=>$genotypes,
2063 marker_objects=>\@all_marker_objects
2065 my $progeny_genotype = $geno_h->get_hybrid_genotype();
2066 my $genotype_string_scores = join "\t", @$progeny_genotype;
2068 $genotype_string .= $genotype_id."\t".$genotype_string_scores."\n";
2070 write_file($tempfile, {append => 1}, $genotype_string);
2071 $counter++;
2075 my $transpose_tempfile = $tempfile . "_transpose";
2077 my $cmd = CXGN::Tools::Run->new(
2079 backend => $backend_config,
2080 submit_host => $cluster_host_config,
2081 temp_base => $tmp_output_dir,
2082 queue => $web_cluster_queue_config,
2083 do_cleanup => 0,
2084 out_file => $transpose_tempfile,
2085 # out_file => $transpose_tempfile,
2086 # don't block and wait if the cluster looks full
2087 max_cluster_jobs => 1_000_000_000,
2091 # Do the transposition job on the cluster
2092 $cmd->run_cluster(
2093 "perl ",
2094 $basepath_config."/bin/transpose_matrix.pl",
2095 $tempfile,
2097 $cmd->is_cluster(1);
2098 $cmd->wait;
2100 my $transpose_tempfile_hdr = $tempfile . "_transpose_hdr";
2102 open my $in, '<', $transpose_tempfile or die "Can't read input file: $!";
2103 open my $out, '>', $transpose_tempfile_hdr or die "Can't write output file: $!";
2105 #Get synonyms of the accessions
2106 my $stocklookup = CXGN::Stock::StockLookup->new({schema => $self->bcs_schema});
2107 my @accession_ids = keys %unique_germplasm;
2108 my $synonym_hash = $stocklookup->get_stock_synonyms('stock_id', 'accession', \@accession_ids);
2109 my $synonym_string = "##SynonymsOfAccessions=\"";
2110 while( my( $uniquename, $synonym_list ) = each %{$synonym_hash}){
2111 if(scalar(@{$synonym_list})>0){
2112 if(not length($synonym_string)<1){
2113 $synonym_string.=" ";
2115 $synonym_string.=$uniquename."=(";
2116 $synonym_string.= (join ", ", @{$synonym_list}).")";
2119 $synonym_string .= "\"";
2120 push @all_protocol_info_lines, $synonym_string;
2122 my $vcf_header = join "\n", @all_protocol_info_lines;
2123 $vcf_header .= "\n";
2125 print $out $vcf_header;
2127 while( <$in> )
2129 print $out $_;
2131 close $in;
2132 close $out;
2134 open my $out_copy, '<', $transpose_tempfile_hdr or die "Can't open output file: $!";
2136 $self->cache()->set($key, '');
2137 $file_handle = $self->cache()->handle($key);
2138 copy($out_copy, $file_handle);
2140 close $out_copy;
2141 $file_handle = $self->cache()->handle($key);
2143 return $file_handle;
2146 sub genosort {
2147 my ($a_chr, $a_pos, $b_chr, $b_pos);
2148 if ($a =~ m/S(\d+)\_(.*)/) {
2149 $a_chr = $1;
2150 $a_pos = $2;
2152 if ($b =~ m/S(\d+)\_(.*)/) {
2153 $b_chr = $1;
2154 $b_pos = $2;
2157 if ($a_chr && $b_chr) {
2158 if ($a_chr == $b_chr) {
2159 return $a_pos <=> $b_pos;
2161 return $a_chr <=> $b_chr;
2162 } else {
2163 return -1;
2167 sub _check_filtered_markers {
2168 my $self = shift;
2169 my $all_marker_objs = shift;
2170 my @all_marker_objects = @$all_marker_objs;
2171 my @filtered_marker_objects;
2172 if ($self->_filtered_markers && scalar(keys %{$self->_filtered_markers}) > 0) {
2173 my $filtered_markers = $self->_filtered_markers;
2174 foreach (@all_marker_objects) {
2175 if (exists($filtered_markers->{$_->{name}})) {
2176 push @filtered_marker_objects, $_;
2179 @all_marker_objects = @filtered_marker_objects;
2181 return @all_marker_objects;
2185 sub get_pcr_genotype_info {
2186 my $self = shift;
2187 my $schema = $self->bcs_schema;
2188 my $genotype_data_project_list = $self->genotype_data_project_list;
2189 my $protocol_id_list = $self->protocol_id_list;
2190 my $genotype_id_list = $self->markerprofile_id_list;
2191 my $protocol_id = $protocol_id_list->[0];
2192 my @where_clause = ();
2193 # print STDERR "GENOTYPE ID =".Dumper($genotype_id_list)."\n";
2194 # print STDERR "PROTOCOL ID =".Dumper($protocol_id_list)."\n";
2196 if ($protocol_id_list && scalar(@$protocol_id_list)>0) {
2197 my $query = join ("," , @$protocol_id_list);
2198 push @where_clause, "nd_protocol.nd_protocol_id in ($query)";
2201 if ($genotype_id_list && scalar(@$genotype_id_list)>0) {
2202 my $query = join ("," , @$genotype_id_list);
2203 push @where_clause, "genotype.genotype_id in ($query)";
2205 # print STDERR "WHERE CLAUSE =".Dumper(\@where_clause)."\n";
2206 my $where_clause = scalar(@where_clause)>0 ? " WHERE " . (join (" AND " , @where_clause)) : '';
2208 my $pcr_genotyping_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'pcr_marker_genotyping', 'genotype_property')->cvterm_id();
2209 my $pcr_protocolprop_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'pcr_marker_details', 'protocol_property')->cvterm_id();
2210 my $pcr_protocol_type_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'pcr_marker_protocol', 'protocol_type')->cvterm_id();
2211 my $ploidy_level_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, "ploidy_level", "stock_property")->cvterm_id();
2213 my $q1 = "SELECT nd_protocolprop.value->>'marker_names' FROM nd_protocolprop WHERE nd_protocol_id = ? AND nd_protocolprop.type_id = ?";
2214 my $h1 = $schema->storage->dbh()->prepare($q1);
2215 $h1->execute($protocol_id, $pcr_protocolprop_cvterm_id);
2216 my $marker_names = $h1->fetchrow_array();
2218 my $q = "SELECT stock.stock_id, stock.uniquename, stockprop.value, cvterm.name, genotype.genotype_id, genotype.description, genotypeprop.value
2219 FROM nd_protocol
2220 JOIN nd_experiment_protocol ON (nd_protocol.nd_protocol_id = nd_experiment_protocol.nd_protocol_id)
2221 JOIN nd_experiment_genotype ON (nd_experiment_protocol.nd_experiment_id = nd_experiment_genotype.nd_experiment_id)
2222 JOIN genotype ON (nd_experiment_genotype.genotype_id = genotype.genotype_id)
2223 JOIN genotypeprop ON (genotype.genotype_id = genotypeprop.genotype_id) AND genotypeprop.type_id = ?
2224 JOIN nd_experiment_stock ON (nd_experiment_genotype.nd_experiment_id = nd_experiment_stock.nd_experiment_id)
2225 JOIN stock ON (nd_experiment_stock.stock_id = stock.stock_id)
2226 LEFT JOIN stockprop ON (stockprop.stock_id = stock.stock_id) AND stockprop.type_id = ?
2227 JOIN cvterm ON (stock.type_id = cvterm.cvterm_id)
2228 $where_clause ORDER BY stock.uniquename ASC";
2230 my $h = $schema->storage->dbh()->prepare($q);
2231 $h->execute($pcr_genotyping_cvterm_id, $ploidy_level_cvterm_id);
2233 my @pcr_genotype_data = ();
2234 while (my ($stock_id, $stock_name, $ploidy_level, $stock_type, $genotype_id, $genotype_description, $genotype_data, $protocol_id) = $h->fetchrow_array()){
2235 push @pcr_genotype_data, [$stock_id, $stock_name, $stock_type, $ploidy_level, $genotype_description, $genotype_id, $genotype_data]
2238 my %protocol_genotype_data = (
2239 marker_names => $marker_names,
2240 protocol_genotype_data => \@pcr_genotype_data
2243 # print STDERR "PCR GENOTYPE INFO =".Dumper(\%protocol_genotype_data)."\n";
2245 return \%protocol_genotype_data;