1 package CXGN
::Genotype
::GRM
;
5 CXGN::Genotype::GRM - an object to handle fetching a GRM for stocks
9 my $geno = CXGN::Genotype::GRM->new({
11 grm_temp_file=>$file_temp_path,
12 people_schema=>$people_schema,
13 accession_id_list=>\@accession_list,
14 plot_id_list=>\@plot_id_list,
15 protocol_id=>$protocol_id,
16 get_grm_for_parental_accessions=>1,
17 cache_root=>$cache_root,
18 download_format=>'matrix', #either 'matrix', 'three_column', or 'heatmap'
19 minor_allele_frequency=>0.01,
21 individuals_filter=>0.8,
24 $geno->download_grm();
28 my $grm = $geno->_get_grm();
35 Nicolas Morales <nm529@cornell.edu>
44 use SGN
::Model
::Cvterm
;
47 use CXGN
::Stock
::Accession
;
48 use CXGN
::Genotype
::Protocol
;
49 use CXGN
::Genotype
::Search
;
50 use CXGN
::Genotype
::ComputeHybridGenotype
;
52 use R
::YapRI
::Data
::Matrix
;
55 use Digest
::MD5 qw
| md5_hex
|;
56 use File
::Slurp qw
| write_file
|;
60 use File
::Temp
'tempfile';
61 use Parallel
::ForkManager
;
64 isa
=> 'Bio::Chado::Schema',
69 has
'people_schema' => (
70 isa
=> 'CXGN::People::Schema',
75 # Uses a cached file system for getting genotype results and getting GRM
82 has
'download_format' => (
93 has
'cache_expiry' => (
96 default => 0, # never expires?
104 has
'grm_temp_file' => (
110 has
'protocol_id' => (
115 has
'minor_allele_frequency' => (
121 has
'marker_filter' => (
127 has
'individuals_filter' => (
133 has
'return_imputed_matrix' => (
139 has
'return_inverse' => (
145 has
'ensure_positive_definite' => (
151 has
'trial_id_list' => (
152 isa
=> 'ArrayRef[Int]|Undef',
156 has
'accession_id_list' => (
157 isa
=> 'ArrayRef[Int]|Undef',
161 has
'accession_name_list' => (
162 isa
=> 'ArrayRef[Str]|Undef',
166 has
'plot_id_list' => (
167 isa
=> 'ArrayRef[Int]|Undef',
171 # If the accessions in the plots you are interested have not been genotyped (as in hybrids), can get this boolean to 1 and give a list of plot_id_list and you will get back a GRM built from the parent accessions for those plots (for the plots whose parents were genotyped)
172 has
'get_grm_for_parental_accessions' => (
178 has
'genotypeprop_hash_select' => (
179 isa
=> 'ArrayRef[Str]',
181 default => sub {['DS']} #THESE ARE THE GENERIC AND EXPECTED VCF ATRRIBUTES. For dosage matrix we only need DS
184 has
'protocolprop_top_key_select' => (
185 isa
=> 'ArrayRef[Str]',
187 default => sub {['markers']} #THESE ARE ALL POSSIBLE TOP LEVEL KEYS IN PROTOCOLPROP BASED ON VCF LOADING. For dosage matrix we only need markers
190 has
'protocolprop_marker_hash_select' => (
191 isa
=> 'ArrayRef[Str]',
193 default => sub {['name']} #THESE ARE ALL POSSIBLE PROTOCOLPROP MARKER HASH KEYS BASED ON VCF LOADING. For dosage matrix we only need name
196 has
'return_only_first_genotypeprop_for_stock' => (
204 my $schema = $self->bcs_schema();
205 my $trial_list = $self->trial_id_list() || [];
206 my $accession_list = $self->accession_id_list() || [];
208 if (scalar(@
$trial_list)>0 && scalar(@
$accession_list)==0) {
209 my %unique_accession_ids;
210 my %unique_accession_names;
211 foreach my $trial_id (@
$trial_list) {
212 my $trial = CXGN
::Trial
->new({
213 bcs_schema
=> $schema,
214 trial_id
=> $trial_id
216 my $accessions = $trial->get_accessions();
217 foreach my $a (@
$accessions) {
218 $unique_accession_ids{$a->{stock_id
}}++;
219 $unique_accession_names{$a->{accession_name
}}++;
222 my @accession_ids = sort keys %unique_accession_ids;
223 my @accession_names = sort keys %unique_accession_names;
224 $self->accession_id_list(\
@accession_ids);
225 $self->accession_name_list(\
@accession_names);
231 my $shared_cluster_dir_config = shift;
232 my $backend_config = shift;
233 my $cluster_host_config = shift;
234 my $web_cluster_queue_config = shift;
235 my $basepath_config = shift;
236 my $schema = $self->bcs_schema();
237 my $people_schema = $self->people_schema();
238 my $cache_root_dir = $self->cache_root();
239 my $accession_list = $self->accession_id_list();
240 my $plot_list = $self->plot_id_list();
241 my $protocol_id = $self->protocol_id();
242 my $get_grm_for_parental_accessions = $self->get_grm_for_parental_accessions();
243 my $grm_tempfile = $self->grm_temp_file();
244 my $return_inverse = $self->return_inverse();
245 my $ensure_positive_definite = $self->ensure_positive_definite();
246 my $return_imputed_matrix = $self->return_imputed_matrix();
248 my $accession_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
249 my $plot_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'plot', 'stock_type')->cvterm_id();
250 my $plot_of_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'plot_of', 'stock_relationship')->cvterm_id();
251 my $female_parent_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'female_parent', 'stock_relationship')->cvterm_id();
252 my $male_parent_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'male_parent', 'stock_relationship')->cvterm_id();
254 my $number_system_cores = `getconf _NPROCESSORS_ONLN` or die "Could not get number of system cores!\n";
255 chomp($number_system_cores);
256 print STDERR
"NUMCORES $number_system_cores\n";
258 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_grm";
259 mkdir $tmp_output_dir if ! -d
$tmp_output_dir;
260 my ($grm_tempfile_out_fh, $grm_tempfile_out) = tempfile
("download_grm_out_XXXXX", DIR
=> $tmp_output_dir);
261 my ($grm_tempfile_debug_out_fh, $grm_tempfile_debug_out) = tempfile
("download_grm_out_XXXXX", DIR
=> $tmp_output_dir);
262 my ($grm_tempfile_stock_ids_out_fh, $grm_tempfile_stock_ids_out) = tempfile
("download_grm_out_XXXXX", DIR
=> $tmp_output_dir);
263 my ($grm_imputed_tempfile_out_fh, $grm_imputed_tempfile_out) = tempfile
("download_grm_out_XXXXX", DIR
=> $tmp_output_dir);
264 my ($temp_out_file_fh, $temp_out_file) = tempfile
("download_grm_tmp_XXXXX", DIR
=> $tmp_output_dir);
266 my $max_processes_limit = 10;
267 my $max_processes = ceil
($number_system_cores/4) <= $max_processes_limit ? ceil($number_system_cores/4) : $max_processes_limit;
268 print STDERR
"NUMCORES USE $max_processes\n";
270 my @individuals_stock_ids;
271 my @all_individual_accessions_stock_ids;
274 my $protocol = CXGN
::Genotype
::Protocol
->new({
275 bcs_schema
=> $schema,
276 nd_protocol_id
=> $protocol_id
278 my $markers = $protocol->markers;
279 my @all_marker_objects = values %$markers;
281 no warnings
'uninitialized';
282 @all_marker_objects = sort { $a->{chrom
} <=> $b->{chrom
} || $a->{pos} <=> $b->{pos} || $a->{name
} cmp $b->{name
} } @all_marker_objects;
284 # In this case a list of accessions is given, so get a GRM between these accessions
285 if ($accession_list && scalar(@
$accession_list)>0 && !$get_grm_for_parental_accessions){
286 @all_individual_accessions_stock_ids = @
$accession_list;
288 my $pm = Parallel
::ForkManager
->new($max_processes);
291 foreach (@
$accession_list) {
292 $pm->start and next LINKS
;
294 my $dataset = CXGN
::Dataset
->new({
295 people_schema
=>$people_schema,
297 cache_root
=>$cache_root_dir,
300 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name'], 1, [], undef, undef, []);
302 if (scalar(@
$genotypes)>0) {
303 my $p1_markers = $genotypes->[0]->{selected_protocol_hash
}->{markers
};
305 # For old genotyping protocols without nd_protocolprop info...
306 if (scalar(@all_marker_objects) == 0) {
307 foreach my $o (sort genosort
keys %{$genotypes->[0]->{selected_genotype_hash
}}) {
308 push @all_marker_objects, {name
=> $o};
312 foreach my $p (0..scalar(@
$genotypes)-1) {
313 my $stock_id = $genotypes->[$p]->{stock_id
};
314 my $genotype_string = "";
315 my @row = ($stock_id);
316 foreach my $m (@all_marker_objects) {
317 push @row, $genotypes->[$p]->{selected_genotype_hash
}->{$m->{name
}}->{'DS'};
319 my $genotype_string_scores = join "\t", @row;
320 $genotype_string .= $genotype_string_scores . "\n";
321 write_file
($grm_tempfile, {append
=> 1}, $genotype_string);
322 undef $genotypes->[$p];
330 $pm->wait_all_children;
332 # IN this case of a hybrid evaluation where the parents of the accessions planted in a plot are genotyped
333 elsif ($get_grm_for_parental_accessions && $plot_list && scalar(@
$plot_list)>0) {
334 print STDERR
"COMPUTING GENOTYPE FROM PARENTS FOR PLOTS\n";
335 my $plot_list_string = join ',', @
$plot_list;
336 my $q = "SELECT plot.stock_id, accession.stock_id, female_parent.stock_id, male_parent.stock_id
338 JOIN stock_relationship AS plot_acc_rel ON(plot_acc_rel.subject_id=plot.stock_id AND plot_acc_rel.type_id=$plot_of_cvterm_id)
339 JOIN stock AS accession ON(plot_acc_rel.object_id=accession.stock_id AND accession.type_id=$accession_cvterm_id)
340 JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
341 JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
342 JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
343 JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
344 WHERE plot.type_id=$plot_cvterm_id AND plot.stock_id IN ($plot_list_string);";
345 my $h = $schema->storage->dbh()->prepare($q);
347 my @plot_stock_ids_found = ();
348 my @plot_accession_stock_ids_found = ();
349 my @plot_female_stock_ids_found = ();
350 my @plot_male_stock_ids_found = ();
351 while (my ($plot_stock_id, $accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
352 push @plot_stock_ids_found, $plot_stock_id;
353 push @plot_accession_stock_ids_found, $accession_stock_id;
354 push @plot_female_stock_ids_found, $female_parent_stock_id;
355 push @plot_male_stock_ids_found, $male_parent_stock_id;
358 @all_individual_accessions_stock_ids = @plot_accession_stock_ids_found;
360 # print STDERR Dumper \@plot_stock_ids_found;
361 # print STDERR Dumper \@plot_female_stock_ids_found;
362 # print STDERR Dumper \@plot_male_stock_ids_found;
364 my $pm = Parallel
::ForkManager
->new($max_processes);
367 for my $i (0..scalar(@plot_stock_ids_found)-1) {
368 $pm->start and next LINKS
;
370 my $female_stock_id = $plot_female_stock_ids_found[$i];
371 my $male_stock_id = $plot_male_stock_ids_found[$i];
372 my $plot_stock_id = $plot_stock_ids_found[$i];
374 my $dataset = CXGN
::Dataset
->new({
375 people_schema
=>$people_schema,
377 cache_root
=>$cache_root_dir,
378 accessions
=>[$female_stock_id, $male_stock_id]
380 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name'], 1, [], undef, undef, []);
382 if (scalar(@
$genotypes) > 0) {
383 # For old genotyping protocols without nd_protocolprop info...
384 if (scalar(@all_marker_objects) == 0) {
385 foreach my $o (sort genosort
keys %{$genotypes->[0]->{selected_genotype_hash
}}) {
386 push @all_marker_objects, {name
=> $o};
390 my $genotype_string = "";
391 my $geno = CXGN
::Genotype
::ComputeHybridGenotype
->new({
392 parental_genotypes
=>$genotypes,
393 marker_objects
=>\
@all_marker_objects
395 my $progeny_genotype = $geno->get_hybrid_genotype();
397 unshift @
$progeny_genotype, $plot_stock_id;
398 my $genotype_string_scores = join "\t", @
$progeny_genotype;
399 $genotype_string .= $genotype_string_scores . "\n";
400 write_file
($grm_tempfile, {append
=> 1}, $genotype_string);
401 undef $progeny_genotype;
407 $pm->wait_all_children;
409 # IN this case of a hybrid evaluation where the parents of the accessions planted in a plot are genotyped
410 elsif ($get_grm_for_parental_accessions && $accession_list && scalar(@
$accession_list)>0) {
411 print STDERR
"COMPUTING GENOTYPE FROM PARENTS FOR ACCESSIONS\n";
412 my $accession_list_string = join ',', @
$accession_list;
413 my $q = "SELECT accession.stock_id, female_parent.stock_id, male_parent.stock_id
414 FROM stock AS accession
415 JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
416 JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
417 JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
418 JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
419 WHERE accession.type_id=$accession_cvterm_id AND accession.stock_id IN ($accession_list_string);";
420 my $h = $schema->storage->dbh()->prepare($q);
422 my @accession_stock_ids_found = ();
423 my @female_stock_ids_found = ();
424 my @male_stock_ids_found = ();
425 while (my ($accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
426 push @accession_stock_ids_found, $accession_stock_id;
427 push @female_stock_ids_found, $female_parent_stock_id;
428 push @male_stock_ids_found, $male_parent_stock_id;
431 # print STDERR Dumper \@accession_stock_ids_found;
432 # print STDERR Dumper \@female_stock_ids_found;
433 # print STDERR Dumper \@male_stock_ids_found;
435 @all_individual_accessions_stock_ids = @
$accession_list;
437 my $pm = Parallel
::ForkManager
->new($max_processes);
440 for my $i (0..scalar(@accession_stock_ids_found)-1) {
441 $pm->start and next LINKS
;
443 my $female_stock_id = $female_stock_ids_found[$i];
444 my $male_stock_id = $male_stock_ids_found[$i];
445 my $accession_stock_id = $accession_stock_ids_found[$i];
447 my $dataset = CXGN
::Dataset
->new({
448 people_schema
=>$people_schema,
450 cache_root
=>$cache_root_dir,
451 accessions
=>[$female_stock_id, $male_stock_id]
453 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name'], 1, [], undef, undef, []);
455 if (scalar(@
$genotypes) > 0) {
456 # For old genotyping protocols without nd_protocolprop info...
457 if (scalar(@all_marker_objects) == 0) {
458 foreach my $o (sort genosort
keys %{$genotypes->[0]->{selected_genotype_hash
}}) {
459 push @all_marker_objects, {name
=> $o};
463 my $genotype_string = "";
464 my $geno = CXGN
::Genotype
::ComputeHybridGenotype
->new({
465 parental_genotypes
=>$genotypes,
466 marker_objects
=>\
@all_marker_objects
468 my $progeny_genotype = $geno->get_hybrid_genotype();
470 unshift @
$progeny_genotype, $accession_stock_id;
471 my $genotype_string_scores = join "\t", @
$progeny_genotype;
472 $genotype_string .= $genotype_string_scores . "\n";
473 write_file
($grm_tempfile, {append
=> 1}, $genotype_string);
474 undef $progeny_genotype;
480 $pm->wait_all_children;
483 # print STDERR Dumper \@all_marker_names;
484 # print STDERR Dumper \@individuals_stock_ids;
485 # print STDERR Dumper \@dosage_matrix;
487 my $maf = $self->minor_allele_frequency();
488 my $marker_filter = $self->marker_filter();
489 my $individuals_filter = $self->individuals_filter();
491 my $cmd = 'R -e "library(rrBLUP); library(data.table); library(scales);
492 mat_full <- fread(\''.$grm_tempfile.'\', header=FALSE, sep=\'\t\');
493 if (ncol(mat_full)<1) { opt <- options(show.error.messages = FALSE); on.exit(options(opt)); stop(); }
494 mat <- mat_full[,-1];
495 stock_ids <- mat_full[,1];
496 write.table(stock_ids, file=\''.$grm_tempfile_stock_ids_out.'\', row.names=FALSE, col.names=FALSE, sep=\'\t\');
497 range_check <- range(as.matrix(mat)[1,]);
498 if (length(table(as.matrix(mat)[1,])) < 2 || (!is.na(range_check[1]) && !is.na(range_check[2]) && range_check[2] - range_check[1] <= 1 )) {
499 mat <- as.data.frame(rescale(as.matrix(mat), to = c(-1,1), from = c(0,2) ));
501 mat <- as.data.frame(rescale(as.matrix(mat), to = c(-1,1) ));
504 #if (!$get_grm_for_parental_accessions) {
505 #strange behavior in filterGenoData during testing... will use A.mat filters instead in this case...
506 # $cmd .= 'mat_clean <- filterGenoData(gData=mat, maf='.$maf.', markerFilter='.$marker_filter.', indFilter='.$individuals_filter.');
507 # A_matrix <- A.mat(mat_clean, impute.method=\'EM\', n.core='.$number_system_cores.', return.imputed=FALSE);
511 if (!$return_imputed_matrix) {
512 $cmd .= 'A <- A.mat(mat, min.MAF='.$maf.', max.missing='.$marker_filter.', impute.method=\'mean\', n.core='.$number_system_cores.', return.imputed=FALSE);
513 write.table(A, file=\''.$grm_tempfile_debug_out.'\', row.names=FALSE, col.names=FALSE, sep=\'\t\');
517 $cmd .= 'A_list <- A.mat(mat, min.MAF='.$maf.', max.missing='.$marker_filter.', impute.method=\'mean\', n.core='.$number_system_cores.', return.imputed=TRUE);
519 imputed <- A_list\$imputed;
520 write.table(imputed, file=\''.$grm_imputed_tempfile_out.'\', row.names=TRUE, col.names=TRUE, sep=\'\t\');
521 write.table(A, file=\''.$grm_tempfile_debug_out.'\', row.names=FALSE, col.names=FALSE, sep=\'\t\');
525 if ($ensure_positive_definite) {
526 # Ensure positive definite matrix. Taken from Schaeffer
527 $cmd .= 'A_o <- A; E <- eigen(A);
536 B = sum(ev[nev])*2.0;
539 ev[nev] = p*(B-val)*(B-val)/wr;
540 A = U%*%diag(ev)%*%t(U);
547 if ($return_inverse) {
548 $cmd .= 'A <- solve(A);';
550 $cmd .= 'write.table(A, file=\''.$grm_tempfile_out.'\', row.names=FALSE, col.names=FALSE, sep=\'\t\');"';
551 print STDERR Dumper
$cmd;
552 my $status = system($cmd);
554 # Do the GRM on the cluster
555 # my $grm_cmd = CXGN::Tools::Run->new(
557 # backend => $backend_config,
558 # submit_host => $cluster_host_config,
559 # temp_base => $tmp_output_dir,
560 # queue => $web_cluster_queue_config,
562 # out_file => $temp_out_file,
563 # # don't block and wait if the cluster looks full
564 # max_cluster_jobs => 1_000_000_000,
568 # $grm_cmd->run_cluster($cmd);
569 # $grm_cmd->is_cluster(1);
572 open(my $fh_stock_ids, "<", $grm_tempfile_stock_ids_out) or die "Can't open < $grm_tempfile_stock_ids_out: $!";
573 while (my $stock_id = <$fh_stock_ids>) {
575 push @individuals_stock_ids, $stock_id;
577 close($fh_stock_ids);
580 print STDERR
"No protocol, so giving equal relationship of all stocks!!\n";
581 my $number_of_stocks = 0;
582 my $stock_id_string = '';
583 if ($accession_list && scalar(@
$accession_list)) {
584 @
$accession_list = sort {$a <=> $b} @
$accession_list;
585 $number_of_stocks = scalar(@
$accession_list);
586 @individuals_stock_ids = @
$accession_list;
587 @all_individual_accessions_stock_ids = @
$accession_list;
589 elsif ($plot_list && scalar(@
$plot_list)) {
590 my $plot_list_string = join ',', @
$plot_list;
591 my $q = "SELECT plot.stock_id, accession.stock_id
593 JOIN stock_relationship AS plot_acc_rel ON(plot_acc_rel.subject_id=plot.stock_id AND plot_acc_rel.type_id=$plot_of_cvterm_id)
594 JOIN stock AS accession ON(plot_acc_rel.object_id=accession.stock_id AND accession.type_id=$accession_cvterm_id)
595 WHERE plot.type_id=$plot_cvterm_id AND plot.stock_id IN ($plot_list_string);";
596 my $h = $schema->storage->dbh()->prepare($q);
598 my %plot_accession_ids;
599 while (my ($plot_stock_id, $accession_stock_id) = $h->fetchrow_array()) {
600 $plot_accession_ids{$accession_stock_id}++;
602 my @seen_accession_ids = sort {$a <=> $b} keys %plot_accession_ids;
603 $number_of_stocks = scalar(@seen_accession_ids);
604 @individuals_stock_ids = @seen_accession_ids;
605 @all_individual_accessions_stock_ids = @seen_accession_ids;
608 A <- as.data.frame(diag('.$number_of_stocks.'));
609 write.table(A, file=\''.$grm_tempfile_out.'\', row.names=FALSE, col.names=FALSE, sep=\'\t\');
611 print STDERR Dumper
$cmd;
612 my $status = system($cmd);
615 return ($grm_tempfile_out, \
@individuals_stock_ids, \
@all_individual_accessions_stock_ids);
620 my $datatype = shift;
622 #print STDERR Dumper($self->_get_dataref());
623 my $json = JSON
->new();
624 #preserve order of hash keys to get same text
625 $json = $json->canonical();
626 my $sorted_accession_list = $self->accession_id_list() || [];
627 my @sorted_accession_list = sort @
$sorted_accession_list;
628 my $accessions = $json->encode( \
@sorted_accession_list );
629 my $plots = $json->encode( $self->plot_id_list() || [] );
630 my $protocol = $self->protocol_id() || '';
631 my $genotypeprophash = $json->encode( $self->genotypeprop_hash_select() || [] );
632 my $protocolprophash = $json->encode( $self->protocolprop_top_key_select() || [] );
633 my $protocolpropmarkerhash = $json->encode( $self->protocolprop_marker_hash_select() || [] );
634 my $maf = $self->minor_allele_frequency();
635 my $marker_filter = $self->marker_filter();
636 my $individuals_filter = $self->individuals_filter();
637 my $return_imputed_matrix = $self->return_imputed_matrix();
638 my $return_imputed_matrix_key = $return_imputed_matrix ?
'_returnimputed' : '';
640 my $q_params = $accessions.$plots.$protocol.$genotypeprophash.$protocolprophash.$protocolpropmarkerhash.$self->get_grm_for_parental_accessions().$self->return_only_first_genotypeprop_for_stock()."_MAF$maf"."_mfilter$marker_filter"."_ifilter$individuals_filter"."_$datatype";
641 if ($self->return_inverse()) {
642 $q_params .= $self->return_inverse();
644 if (!$self->ensure_positive_definite()) {
645 $q_params .= $self->ensure_positive_definite();
647 my $key = md5_hex
($q_params);
653 my $return_type = shift || 'filehandle';
654 my $shared_cluster_dir_config = shift;
655 my $backend_config = shift;
656 my $cluster_host_config = shift;
657 my $web_cluster_queue_config = shift;
658 my $basepath_config = shift;
659 my $download_format = $self->download_format();
660 my $grm_tempfile = $self->grm_temp_file();
661 my $schema = $self->bcs_schema();
662 my $protocol_id = $self->protocol_id();
663 my $accession_list = $self->accession_id_list();
664 my $plot_list = $self->plot_id_list();
666 print STDERR
"$download_format\n";
667 my $version_string = "download_grm_v04";
668 my $target_key = $version_string.$download_format;
670 my $matrix_key = $self->grm_cache_key($version_string.'matrix');
671 my $matrix_uniquenames_key = $self->grm_cache_key($version_string.'matrix_uniquenames');
672 my $three_column_key = $self->grm_cache_key($version_string.'three_column');
673 my $three_column_uniquenames_key = $self->grm_cache_key($version_string.'three_column_uniquenames');
674 my $three_column_stock_id_integer_key = $self->grm_cache_key($version_string.'three_column_stock_id_integer');
675 my $three_column_reciprocal_key = $self->grm_cache_key($version_string.'three_column_reciprocal');
676 my $three_column_reciprocal_uniquenames_key = $self->grm_cache_key($version_string.'three_column_reciprocal_uniquenames');
677 my $three_column_reciprocal_stock_id_integer_key = $self->grm_cache_key($version_string.'three_column_reciprocal_stock_id_integer');
678 my $heatmap_key = $self->grm_cache_key($version_string.'heatmap');
680 my $key = $self->grm_cache_key($target_key);
681 $self->cache( Cache
::File
->new( cache_root
=> $self->cache_root() ));
684 if ($self->cache()->exists($key)) {
685 print STDERR
"DOWNLOAD GRM CACHED\n";
686 if ($return_type eq 'filehandle') {
687 $return = $self->cache()->handle($key);
689 elsif ($return_type eq 'data') {
690 $return = $self->cache()->get($key);
694 print STDERR
"DOWNLOAD GRM\n";
696 my ($grm_tempfile_out, $stock_ids, $all_accession_stock_ids) = $self->_get_grm($shared_cluster_dir_config, $backend_config, $cluster_host_config, $web_cluster_queue_config, $basepath_config);
700 open(my $fh, "<", $grm_tempfile_out) or die "Can't open < $grm_tempfile_out: $!";
701 while (my $row = <$fh>) {
703 my @vals = split "\t", $row;
705 my $a_stock_id = $stock_ids->[$row_num];
707 foreach my $val (@vals) {
708 my $b_stock_id = $stock_ids->[$col_num];
709 $grm_hash{$a_stock_id}->{$b_stock_id} = $val;
710 $grm_hash{$b_stock_id}->{$a_stock_id} = $val;
716 # print STDERR Dumper \%grm_hash;
717 # print STDERR Dumper $all_accession_stock_ids;
720 my $q = "SELECT stock.uniquename, stock.stock_id
722 WHERE stock.is_obsolete = 'F';";
723 my $h = $schema->storage->dbh()->prepare($q);
725 while (my ($uniquename, $stock_id) = $h->fetchrow_array()) {
726 $all_names{$stock_id} = $uniquename;
729 my $matrix_data = '';
730 my @matrix_header = ("stock_id");
732 my $matrix_uniquenames_data = '';
733 my @matrix_uniquenames_header = ("stock_uniquename");
735 foreach (@
$all_accession_stock_ids) {
736 my $s1 = $all_names{$_};
737 push @matrix_header, "S".$_;
738 push @matrix_uniquenames_header, $s1;
741 my $matrix_header_line = join "\t", @matrix_header;
742 $matrix_data = "$matrix_header_line\n";
744 my $matrix_uniquenames_header_line = join "\t", @matrix_uniquenames_header;
745 $matrix_uniquenames_data = "$matrix_uniquenames_header_line\n";
747 foreach my $s (@
$all_accession_stock_ids) {
748 my $s2 = $all_names{$s};
749 my @matrix_row = ("S".$s);
750 my @matrix_uniquenames_row = ($s2);
751 foreach my $c (@
$all_accession_stock_ids) {
753 if ($row_num > 0 && defined($grm_hash{$s}->{$c})) {
754 $val = $grm_hash{$s}->{$c};
763 push @matrix_row, $val;
764 push @matrix_uniquenames_row, $val;
766 my $matrix_line = join "\t", @matrix_row;
767 my $matrix_uniquenames_line = join "\t", @matrix_uniquenames_row;
768 $matrix_data .= "$matrix_line\n";
769 $matrix_uniquenames_data .= "$matrix_uniquenames_line\n";
772 $self->cache()->set($matrix_key, $matrix_data);
773 print STDERR
"CACHED GRM MATRIX\n";
775 $self->cache()->set($matrix_uniquenames_key, $matrix_uniquenames_data);
776 print STDERR
"CACHED GRM MATRIX UNIQUENAMES\n";
779 my %three_column_result_hash;
780 my $three_column_data = '';
781 my $three_column_stock_id_integer_data = '';
782 my $three_column_uniquenames_data = '';
783 foreach my $s (@
$all_accession_stock_ids) {
784 foreach my $c (@
$all_accession_stock_ids) {
785 if (!exists($three_column_result_hash{$s}->{$c}) && !exists($three_column_result_hash{$c}->{$s})) {
787 if ($row_num > 0 && defined($grm_hash{$s}->{$c})) {
788 $val = $grm_hash{$s}->{$c};
797 $three_column_data .= "S$s\tS$c\t$val\n";
798 $three_column_stock_id_integer_data .= "$s\t$c\t$val\n";
800 my $s1 = $all_names{$s};
801 my $s2 = $all_names{$c};
802 $three_column_uniquenames_data .= "$s1\t$s2\t$val\n";
804 $three_column_result_hash{$s}->{$c} = $val;
809 $self->cache()->set($three_column_key, $three_column_data);
810 print STDERR
"CACHED GRM THREE COLUMN\n";
812 $self->cache()->set($three_column_stock_id_integer_key, $three_column_stock_id_integer_data);
813 print STDERR
"CACHED GRM THREE COLUMN STOCK ID\n";
815 $self->cache()->set($three_column_uniquenames_key, $three_column_uniquenames_data);
816 print STDERR
"CACHED GRM THREE COLUMN UNIQUENAMES\n";
819 my $three_column_reciprocal_data = '';
820 my $three_column_reciprocal_stock_id_integer_data = '';
821 my $three_column_reciprocal_uniquenames_data = '';
822 foreach my $s (@
$all_accession_stock_ids) {
823 foreach my $c (@
$all_accession_stock_ids) {
825 if ($row_num > 0 && defined($grm_hash{$s}->{$c})) {
826 $val = $grm_hash{$s}->{$c};
835 $three_column_reciprocal_data .= "S$s\tS$c\t$val\n";
836 $three_column_reciprocal_stock_id_integer_data .= "$s\t$c\t$val\n";
838 my $s1 = $all_names{$s};
839 my $s2 = $all_names{$c};
840 $three_column_reciprocal_uniquenames_data .= "$s1\t$s2\t$val\n";
844 $self->cache()->set($three_column_reciprocal_key, $three_column_reciprocal_data);
845 print STDERR
"CACHED GRM THREE COLUMN Reciprocal\n";
847 $self->cache()->set($three_column_reciprocal_uniquenames_key, $three_column_reciprocal_uniquenames_data);
848 print STDERR
"CACHED GRM THREE COLUMN Reciprocal UNIQUENAMES\n";
850 $self->cache()->set($three_column_reciprocal_stock_id_integer_key, $three_column_reciprocal_stock_id_integer_data);
851 print STDERR
"CACHED GRM THREE COLUMN Reciprocal STOCK IDS\n";
854 my $grm_tempfile_heatmap_data = $grm_tempfile . "_heatmap_data";
855 open(my $heatmap_fh, '>', $grm_tempfile_heatmap_data) or die $!;
856 print $heatmap_fh $three_column_reciprocal_uniquenames_data;
859 my $grm_tempfile_heatmap_out = $grm_tempfile . "_plot_out";
860 my $heatmap_cmd = 'R -e "library(ggplot2); library(data.table); library(viridis); library(GGally); library(gridExtra);
861 mat <- fread(\''.$grm_tempfile_heatmap_data.'\', header=FALSE, sep=\'\t\', stringsAsFactors=FALSE);
862 gg <- ggplot(mat, aes(V1, V2, fill=V3)) +
864 scale_fill_viridis(discrete=FALSE);
865 ggsave(\''.$grm_tempfile_heatmap_out.'\', gg, device=\'pdf\', width=8.5, height=11, units=\'in\');
867 print STDERR Dumper
$heatmap_cmd;
868 my $status_heatmap = system($heatmap_cmd);
870 open my $out_copy, '<', $grm_tempfile_heatmap_out or die "Can't open output file: $!";
871 $self->cache()->set($heatmap_key, '');
872 my $file_handle = $self->cache()->handle($heatmap_key);
873 copy
($out_copy, $file_handle);
876 print STDERR
"CACHED GRM HEATMAP\n";
878 if ($return_type eq 'filehandle') {
879 $return = $self->cache()->handle($key);
881 elsif ($return_type eq 'data') {
883 if ($download_format eq 'matrix') {
884 $data = $matrix_data;
886 elsif ($download_format eq 'matrix_uniquenames') {
887 $data = $matrix_uniquenames_data;
889 elsif ($download_format eq 'three_column') {
890 $data = $three_column_data;
892 elsif ($download_format eq 'three_column_uniquenames') {
893 $data = $three_column_uniquenames_data;
895 elsif ($download_format eq 'three_column_stock_id_integer') {
896 $data = $three_column_stock_id_integer_data;
898 elsif ($download_format eq 'three_column_reciprocal') {
899 $data = $three_column_reciprocal_data;
901 elsif ($download_format eq 'three_column_reciprocal_uniquenames') {
902 $data = $three_column_reciprocal_uniquenames_data;
904 elsif ($download_format eq 'three_column_reciprocal_stock_id_integer') {
905 $data = $three_column_reciprocal_stock_id_integer_data;
907 elsif ($download_format eq 'heatmap') {
908 die "Can only return the filehandle (Not Data) for GRM heatmap!\n";
917 my ($a_chr, $a_pos, $b_chr, $b_pos);
918 if ($a =~ m/S(\d+)\_(.*)/) {
922 if ($b =~ m/S(\d+)\_(.*)/) {
927 if ($a_chr && $b_chr) {
928 if ($a_chr == $b_chr) {
929 return $a_pos <=> $b_pos;
931 return $a_chr <=> $b_chr;