start fixing test for multi cat phenotype upload.
[sgn.git] / lib / CXGN / Genotype / GRM.pm
blob5a14ba4c7ba624284076dca394a7a7e6caaa6f3d
1 package CXGN::Genotype::GRM;
3 =head1 NAME
5 CXGN::Genotype::GRM - an object to handle fetching a GRM for stocks
7 =head1 USAGE
9 my $geno = CXGN::Genotype::GRM->new({
10 bcs_schema=>$schema,
11 grm_temp_file=>$file_temp_path,
12 people_schema=>$people_schema,
13 accession_id_list=>\@accession_list,
14 plot_id_list=>\@plot_id_list,
15 protocol_id=>$protocol_id,
16 get_grm_for_parental_accessions=>1,
17 cache_root=>$cache_root,
18 download_format=>'matrix', #either 'matrix', 'three_column', or 'heatmap'
19 minor_allele_frequency=>0.01,
20 marker_filter=>0.6,
21 individuals_filter=>0.8,
22 });
23 RECOMMENDED
24 $geno->download_grm();
28 my $grm = $geno->_get_grm();
30 =head1 DESCRIPTION
33 =head1 AUTHORS
35 Nicolas Morales <nm529@cornell.edu>
37 =cut
39 use strict;
40 use warnings;
41 use Moose;
42 use Try::Tiny;
43 use Data::Dumper;
44 use SGN::Model::Cvterm;
45 use CXGN::Trial;
46 use JSON;
47 use CXGN::Stock::Accession;
48 use CXGN::Genotype::Protocol;
49 use CXGN::Genotype::Search;
50 use CXGN::Genotype::ComputeHybridGenotype;
51 use R::YapRI::Base;
52 use R::YapRI::Data::Matrix;
53 use CXGN::Dataset;
54 use Cache::File;
55 use Digest::MD5 qw | md5_hex |;
56 use File::Slurp qw | write_file |;
57 use POSIX;
58 use File::Copy;
59 use CXGN::Tools::Run;
60 use File::Temp 'tempfile';
61 use Parallel::ForkManager;
63 has 'bcs_schema' => (
64 isa => 'Bio::Chado::Schema',
65 is => 'rw',
66 required => 1
69 has 'people_schema' => (
70 isa => 'CXGN::People::Schema',
71 is => 'rw',
72 required => 1
75 # Uses a cached file system for getting genotype results and getting GRM
76 has 'cache_root' => (
77 isa => 'Str',
78 is => 'rw',
79 required => 1
82 has 'download_format' => (
83 isa => 'Str',
84 is => 'rw',
85 required => 1
88 has 'cache' => (
89 isa => 'Cache::File',
90 is => 'rw',
93 has 'cache_expiry' => (
94 isa => 'Int',
95 is => 'rw',
96 default => 0, # never expires?
99 has '_cache_key' => (
100 isa => 'Str',
101 is => 'rw',
104 has 'grm_temp_file' => (
105 isa => 'Str',
106 is => 'rw',
107 required => 1
110 has 'protocol_id' => (
111 isa => 'Int|Undef',
112 is => 'rw',
115 has 'minor_allele_frequency' => (
116 isa => 'Num',
117 is => 'rw',
118 default => sub{0.05}
121 has 'marker_filter' => (
122 isa => 'Num',
123 is => 'rw',
124 default => sub{0.60}
127 has 'individuals_filter' => (
128 isa => 'Num',
129 is => 'rw',
130 default => sub{0.80}
133 has 'return_imputed_matrix' => (
134 isa => 'Bool',
135 is => 'ro',
136 default => 0
139 has 'return_inverse' => (
140 isa => 'Bool',
141 is => 'ro',
142 default => 0
145 has 'ensure_positive_definite' => (
146 isa => 'Bool',
147 is => 'ro',
148 default => 1
151 has 'trial_id_list' => (
152 isa => 'ArrayRef[Int]|Undef',
153 is => 'rw'
156 has 'accession_id_list' => (
157 isa => 'ArrayRef[Int]|Undef',
158 is => 'rw'
161 has 'accession_name_list' => (
162 isa => 'ArrayRef[Str]|Undef',
163 is => 'rw'
166 has 'plot_id_list' => (
167 isa => 'ArrayRef[Int]|Undef',
168 is => 'rw'
171 # If the accessions in the plots you are interested have not been genotyped (as in hybrids), can get this boolean to 1 and give a list of plot_id_list and you will get back a GRM built from the parent accessions for those plots (for the plots whose parents were genotyped)
172 has 'get_grm_for_parental_accessions' => (
173 isa => 'Bool',
174 is => 'ro',
175 default => 0
178 has 'genotypeprop_hash_select' => (
179 isa => 'ArrayRef[Str]',
180 is => 'ro',
181 default => sub {['DS']} #THESE ARE THE GENERIC AND EXPECTED VCF ATRRIBUTES. For dosage matrix we only need DS
184 has 'protocolprop_top_key_select' => (
185 isa => 'ArrayRef[Str]',
186 is => 'ro',
187 default => sub {['markers']} #THESE ARE ALL POSSIBLE TOP LEVEL KEYS IN PROTOCOLPROP BASED ON VCF LOADING. For dosage matrix we only need markers
190 has 'protocolprop_marker_hash_select' => (
191 isa => 'ArrayRef[Str]',
192 is => 'ro',
193 default => sub {['name']} #THESE ARE ALL POSSIBLE PROTOCOLPROP MARKER HASH KEYS BASED ON VCF LOADING. For dosage matrix we only need name
196 has 'return_only_first_genotypeprop_for_stock' => (
197 isa => 'Bool',
198 is => 'ro',
199 default => 1
202 sub BUILD {
203 my $self = shift;
204 my $schema = $self->bcs_schema();
205 my $trial_list = $self->trial_id_list() || [];
206 my $accession_list = $self->accession_id_list() || [];
208 if (scalar(@$trial_list)>0 && scalar(@$accession_list)==0) {
209 my %unique_accession_ids;
210 my %unique_accession_names;
211 foreach my $trial_id (@$trial_list) {
212 my $trial = CXGN::Trial->new({
213 bcs_schema => $schema,
214 trial_id => $trial_id
216 my $accessions = $trial->get_accessions();
217 foreach my $a (@$accessions) {
218 $unique_accession_ids{$a->{stock_id}}++;
219 $unique_accession_names{$a->{accession_name}}++;
222 my @accession_ids = sort keys %unique_accession_ids;
223 my @accession_names = sort keys %unique_accession_names;
224 $self->accession_id_list(\@accession_ids);
225 $self->accession_name_list(\@accession_names);
229 sub _get_grm {
230 my $self = shift;
231 my $shared_cluster_dir_config = shift;
232 my $backend_config = shift;
233 my $cluster_host_config = shift;
234 my $web_cluster_queue_config = shift;
235 my $basepath_config = shift;
236 my $schema = $self->bcs_schema();
237 my $people_schema = $self->people_schema();
238 my $cache_root_dir = $self->cache_root();
239 my $accession_list = $self->accession_id_list();
240 my $plot_list = $self->plot_id_list();
241 my $protocol_id = $self->protocol_id();
242 my $get_grm_for_parental_accessions = $self->get_grm_for_parental_accessions();
243 my $grm_tempfile = $self->grm_temp_file();
244 my $return_inverse = $self->return_inverse();
245 my $ensure_positive_definite = $self->ensure_positive_definite();
246 my $return_imputed_matrix = $self->return_imputed_matrix();
248 my $accession_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
249 my $plot_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'plot', 'stock_type')->cvterm_id();
250 my $plot_of_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'plot_of', 'stock_relationship')->cvterm_id();
251 my $female_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'female_parent', 'stock_relationship')->cvterm_id();
252 my $male_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'male_parent', 'stock_relationship')->cvterm_id();
254 my $number_system_cores = `getconf _NPROCESSORS_ONLN` or die "Could not get number of system cores!\n";
255 chomp($number_system_cores);
256 print STDERR "NUMCORES $number_system_cores\n";
258 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_grm";
259 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
260 my ($grm_tempfile_out_fh, $grm_tempfile_out) = tempfile("download_grm_out_XXXXX", DIR=> $tmp_output_dir);
261 my ($grm_tempfile_debug_out_fh, $grm_tempfile_debug_out) = tempfile("download_grm_out_XXXXX", DIR=> $tmp_output_dir);
262 my ($grm_tempfile_stock_ids_out_fh, $grm_tempfile_stock_ids_out) = tempfile("download_grm_out_XXXXX", DIR=> $tmp_output_dir);
263 my ($grm_imputed_tempfile_out_fh, $grm_imputed_tempfile_out) = tempfile("download_grm_out_XXXXX", DIR=> $tmp_output_dir);
264 my ($temp_out_file_fh, $temp_out_file) = tempfile("download_grm_tmp_XXXXX", DIR=> $tmp_output_dir);
266 my $max_processes_limit = 10;
267 my $max_processes = ceil($number_system_cores/4) <= $max_processes_limit ? ceil($number_system_cores/4) : $max_processes_limit;
268 print STDERR "NUMCORES USE $max_processes\n";
270 my @individuals_stock_ids;
271 my @all_individual_accessions_stock_ids;
273 if ($protocol_id) {
274 my $protocol = CXGN::Genotype::Protocol->new({
275 bcs_schema => $schema,
276 nd_protocol_id => $protocol_id
278 my $markers = $protocol->markers;
279 my @all_marker_objects = values %$markers;
281 no warnings 'uninitialized';
282 @all_marker_objects = sort { $a->{chrom} <=> $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
284 # In this case a list of accessions is given, so get a GRM between these accessions
285 if ($accession_list && scalar(@$accession_list)>0 && !$get_grm_for_parental_accessions){
286 @all_individual_accessions_stock_ids = @$accession_list;
288 my $pm = Parallel::ForkManager->new($max_processes);
290 LINKS:
291 foreach (@$accession_list) {
292 $pm->start and next LINKS;
294 my $dataset = CXGN::Dataset->new({
295 people_schema=>$people_schema,
296 schema=>$schema,
297 cache_root=>$cache_root_dir,
298 accessions=>[$_]
300 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name'], 1, [], undef, undef, []);
302 if (scalar(@$genotypes)>0) {
303 my $p1_markers = $genotypes->[0]->{selected_protocol_hash}->{markers};
305 # For old genotyping protocols without nd_protocolprop info...
306 if (scalar(@all_marker_objects) == 0) {
307 foreach my $o (sort genosort keys %{$genotypes->[0]->{selected_genotype_hash}}) {
308 push @all_marker_objects, {name => $o};
312 foreach my $p (0..scalar(@$genotypes)-1) {
313 my $stock_id = $genotypes->[$p]->{stock_id};
314 my $genotype_string = "";
315 my @row = ($stock_id);
316 foreach my $m (@all_marker_objects) {
317 push @row, $genotypes->[$p]->{selected_genotype_hash}->{$m->{name}}->{'DS'};
319 my $genotype_string_scores = join "\t", @row;
320 $genotype_string .= $genotype_string_scores . "\n";
321 write_file($grm_tempfile, {append => 1}, $genotype_string);
322 undef $genotypes->[$p];
324 undef $genotypes;
327 $pm->finish;
330 $pm->wait_all_children;
332 # IN this case of a hybrid evaluation where the parents of the accessions planted in a plot are genotyped
333 elsif ($get_grm_for_parental_accessions && $plot_list && scalar(@$plot_list)>0) {
334 print STDERR "COMPUTING GENOTYPE FROM PARENTS FOR PLOTS\n";
335 my $plot_list_string = join ',', @$plot_list;
336 my $q = "SELECT plot.stock_id, accession.stock_id, female_parent.stock_id, male_parent.stock_id
337 FROM stock AS plot
338 JOIN stock_relationship AS plot_acc_rel ON(plot_acc_rel.subject_id=plot.stock_id AND plot_acc_rel.type_id=$plot_of_cvterm_id)
339 JOIN stock AS accession ON(plot_acc_rel.object_id=accession.stock_id AND accession.type_id=$accession_cvterm_id)
340 JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
341 JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
342 JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
343 JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
344 WHERE plot.type_id=$plot_cvterm_id AND plot.stock_id IN ($plot_list_string);";
345 my $h = $schema->storage->dbh()->prepare($q);
346 $h->execute();
347 my @plot_stock_ids_found = ();
348 my @plot_accession_stock_ids_found = ();
349 my @plot_female_stock_ids_found = ();
350 my @plot_male_stock_ids_found = ();
351 while (my ($plot_stock_id, $accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
352 push @plot_stock_ids_found, $plot_stock_id;
353 push @plot_accession_stock_ids_found, $accession_stock_id;
354 push @plot_female_stock_ids_found, $female_parent_stock_id;
355 push @plot_male_stock_ids_found, $male_parent_stock_id;
358 @all_individual_accessions_stock_ids = @plot_accession_stock_ids_found;
360 # print STDERR Dumper \@plot_stock_ids_found;
361 # print STDERR Dumper \@plot_female_stock_ids_found;
362 # print STDERR Dumper \@plot_male_stock_ids_found;
364 my $pm = Parallel::ForkManager->new($max_processes);
366 LINKS:
367 for my $i (0..scalar(@plot_stock_ids_found)-1) {
368 $pm->start and next LINKS;
370 my $female_stock_id = $plot_female_stock_ids_found[$i];
371 my $male_stock_id = $plot_male_stock_ids_found[$i];
372 my $plot_stock_id = $plot_stock_ids_found[$i];
374 my $dataset = CXGN::Dataset->new({
375 people_schema=>$people_schema,
376 schema=>$schema,
377 cache_root=>$cache_root_dir,
378 accessions=>[$female_stock_id, $male_stock_id]
380 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name'], 1, [], undef, undef, []);
382 if (scalar(@$genotypes) > 0) {
383 # For old genotyping protocols without nd_protocolprop info...
384 if (scalar(@all_marker_objects) == 0) {
385 foreach my $o (sort genosort keys %{$genotypes->[0]->{selected_genotype_hash}}) {
386 push @all_marker_objects, {name => $o};
390 my $genotype_string = "";
391 my $geno = CXGN::Genotype::ComputeHybridGenotype->new({
392 parental_genotypes=>$genotypes,
393 marker_objects=>\@all_marker_objects
395 my $progeny_genotype = $geno->get_hybrid_genotype();
397 unshift @$progeny_genotype, $plot_stock_id;
398 my $genotype_string_scores = join "\t", @$progeny_genotype;
399 $genotype_string .= $genotype_string_scores . "\n";
400 write_file($grm_tempfile, {append => 1}, $genotype_string);
401 undef $progeny_genotype;
404 $pm->finish;
407 $pm->wait_all_children;
409 # IN this case of a hybrid evaluation where the parents of the accessions planted in a plot are genotyped
410 elsif ($get_grm_for_parental_accessions && $accession_list && scalar(@$accession_list)>0) {
411 print STDERR "COMPUTING GENOTYPE FROM PARENTS FOR ACCESSIONS\n";
412 my $accession_list_string = join ',', @$accession_list;
413 my $q = "SELECT accession.stock_id, female_parent.stock_id, male_parent.stock_id
414 FROM stock AS accession
415 JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
416 JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
417 JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
418 JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
419 WHERE accession.type_id=$accession_cvterm_id AND accession.stock_id IN ($accession_list_string);";
420 my $h = $schema->storage->dbh()->prepare($q);
421 $h->execute();
422 my @accession_stock_ids_found = ();
423 my @female_stock_ids_found = ();
424 my @male_stock_ids_found = ();
425 while (my ($accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
426 push @accession_stock_ids_found, $accession_stock_id;
427 push @female_stock_ids_found, $female_parent_stock_id;
428 push @male_stock_ids_found, $male_parent_stock_id;
431 # print STDERR Dumper \@accession_stock_ids_found;
432 # print STDERR Dumper \@female_stock_ids_found;
433 # print STDERR Dumper \@male_stock_ids_found;
435 @all_individual_accessions_stock_ids = @$accession_list;
437 my $pm = Parallel::ForkManager->new($max_processes);
439 LINKS:
440 for my $i (0..scalar(@accession_stock_ids_found)-1) {
441 $pm->start and next LINKS;
443 my $female_stock_id = $female_stock_ids_found[$i];
444 my $male_stock_id = $male_stock_ids_found[$i];
445 my $accession_stock_id = $accession_stock_ids_found[$i];
447 my $dataset = CXGN::Dataset->new({
448 people_schema=>$people_schema,
449 schema=>$schema,
450 cache_root=>$cache_root_dir,
451 accessions=>[$female_stock_id, $male_stock_id]
453 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name'], 1, [], undef, undef, []);
455 if (scalar(@$genotypes) > 0) {
456 # For old genotyping protocols without nd_protocolprop info...
457 if (scalar(@all_marker_objects) == 0) {
458 foreach my $o (sort genosort keys %{$genotypes->[0]->{selected_genotype_hash}}) {
459 push @all_marker_objects, {name => $o};
463 my $genotype_string = "";
464 my $geno = CXGN::Genotype::ComputeHybridGenotype->new({
465 parental_genotypes=>$genotypes,
466 marker_objects=>\@all_marker_objects
468 my $progeny_genotype = $geno->get_hybrid_genotype();
470 unshift @$progeny_genotype, $accession_stock_id;
471 my $genotype_string_scores = join "\t", @$progeny_genotype;
472 $genotype_string .= $genotype_string_scores . "\n";
473 write_file($grm_tempfile, {append => 1}, $genotype_string);
474 undef $progeny_genotype;
477 $pm->finish;
480 $pm->wait_all_children;
483 # print STDERR Dumper \@all_marker_names;
484 # print STDERR Dumper \@individuals_stock_ids;
485 # print STDERR Dumper \@dosage_matrix;
487 my $maf = $self->minor_allele_frequency();
488 my $marker_filter = $self->marker_filter();
489 my $individuals_filter = $self->individuals_filter();
491 my $cmd = 'R -e "library(rrBLUP); library(data.table); library(scales);
492 mat_full <- fread(\''.$grm_tempfile.'\', header=FALSE, sep=\'\t\');
493 if (ncol(mat_full)<1) { opt <- options(show.error.messages = FALSE); on.exit(options(opt)); stop(); }
494 mat <- mat_full[,-1];
495 stock_ids <- mat_full[,1];
496 write.table(stock_ids, file=\''.$grm_tempfile_stock_ids_out.'\', row.names=FALSE, col.names=FALSE, sep=\'\t\');
497 range_check <- range(as.matrix(mat)[1,]);
498 if (length(table(as.matrix(mat)[1,])) < 2 || (!is.na(range_check[1]) && !is.na(range_check[2]) && range_check[2] - range_check[1] <= 1 )) {
499 mat <- as.data.frame(rescale(as.matrix(mat), to = c(-1,1), from = c(0,2) ));
500 } else {
501 mat <- as.data.frame(rescale(as.matrix(mat), to = c(-1,1) ));
504 #if (!$get_grm_for_parental_accessions) {
505 #strange behavior in filterGenoData during testing... will use A.mat filters instead in this case...
506 # $cmd .= 'mat_clean <- filterGenoData(gData=mat, maf='.$maf.', markerFilter='.$marker_filter.', indFilter='.$individuals_filter.');
507 # A_matrix <- A.mat(mat_clean, impute.method=\'EM\', n.core='.$number_system_cores.', return.imputed=FALSE);
508 # ';
510 #else {
511 if (!$return_imputed_matrix) {
512 $cmd .= 'A <- A.mat(mat, min.MAF='.$maf.', max.missing='.$marker_filter.', impute.method=\'mean\', n.core='.$number_system_cores.', return.imputed=FALSE);
513 write.table(A, file=\''.$grm_tempfile_debug_out.'\', row.names=FALSE, col.names=FALSE, sep=\'\t\');
516 else {
517 $cmd .= 'A_list <- A.mat(mat, min.MAF='.$maf.', max.missing='.$marker_filter.', impute.method=\'mean\', n.core='.$number_system_cores.', return.imputed=TRUE);
518 A <- A_list\$A;
519 imputed <- A_list\$imputed;
520 write.table(imputed, file=\''.$grm_imputed_tempfile_out.'\', row.names=TRUE, col.names=TRUE, sep=\'\t\');
521 write.table(A, file=\''.$grm_tempfile_debug_out.'\', row.names=FALSE, col.names=FALSE, sep=\'\t\');
525 if ($ensure_positive_definite) {
526 # Ensure positive definite matrix. Taken from Schaeffer
527 $cmd .= 'A_o <- A; E <- eigen(A);
528 ev = E\$values;
529 U = E\$vectors;
530 no = dim(A)[1];
531 nev = which(ev < 0);
532 wr = 0;
533 k=length(nev);
534 if(k > 0){
535 p = ev[no - k];
536 B = sum(ev[nev])*2.0;
537 wr = (B*B*100.0)+1;
538 val = ev[nev];
539 ev[nev] = p*(B-val)*(B-val)/wr;
540 A = U%*%diag(ev)%*%t(U);
542 if (max(A)>=60) {
543 A <- A_o;
547 if ($return_inverse) {
548 $cmd .= 'A <- solve(A);';
550 $cmd .= 'write.table(A, file=\''.$grm_tempfile_out.'\', row.names=FALSE, col.names=FALSE, sep=\'\t\');"';
551 print STDERR Dumper $cmd;
552 my $status = system($cmd);
554 # Do the GRM on the cluster
555 # my $grm_cmd = CXGN::Tools::Run->new(
557 # backend => $backend_config,
558 # submit_host => $cluster_host_config,
559 # temp_base => $tmp_output_dir,
560 # queue => $web_cluster_queue_config,
561 # do_cleanup => 0,
562 # out_file => $temp_out_file,
563 # # don't block and wait if the cluster looks full
564 # max_cluster_jobs => 1_000_000_000,
566 # );
568 # $grm_cmd->run_cluster($cmd);
569 # $grm_cmd->is_cluster(1);
570 # $grm_cmd->wait;
572 open(my $fh_stock_ids, "<", $grm_tempfile_stock_ids_out) or die "Can't open < $grm_tempfile_stock_ids_out: $!";
573 while (my $stock_id = <$fh_stock_ids>) {
574 chomp($stock_id);
575 push @individuals_stock_ids, $stock_id;
577 close($fh_stock_ids);
579 else {
580 print STDERR "No protocol, so giving equal relationship of all stocks!!\n";
581 my $number_of_stocks = 0;
582 my $stock_id_string = '';
583 if ($accession_list && scalar(@$accession_list)) {
584 @$accession_list = sort {$a <=> $b} @$accession_list;
585 $number_of_stocks = scalar(@$accession_list);
586 @individuals_stock_ids = @$accession_list;
587 @all_individual_accessions_stock_ids = @$accession_list;
589 elsif ($plot_list && scalar(@$plot_list)) {
590 my $plot_list_string = join ',', @$plot_list;
591 my $q = "SELECT plot.stock_id, accession.stock_id
592 FROM stock AS plot
593 JOIN stock_relationship AS plot_acc_rel ON(plot_acc_rel.subject_id=plot.stock_id AND plot_acc_rel.type_id=$plot_of_cvterm_id)
594 JOIN stock AS accession ON(plot_acc_rel.object_id=accession.stock_id AND accession.type_id=$accession_cvterm_id)
595 WHERE plot.type_id=$plot_cvterm_id AND plot.stock_id IN ($plot_list_string);";
596 my $h = $schema->storage->dbh()->prepare($q);
597 $h->execute();
598 my %plot_accession_ids;
599 while (my ($plot_stock_id, $accession_stock_id) = $h->fetchrow_array()) {
600 $plot_accession_ids{$accession_stock_id}++;
602 my @seen_accession_ids = sort {$a <=> $b} keys %plot_accession_ids;
603 $number_of_stocks = scalar(@seen_accession_ids);
604 @individuals_stock_ids = @seen_accession_ids;
605 @all_individual_accessions_stock_ids = @seen_accession_ids;
607 my $cmd .= 'R -e "
608 A <- as.data.frame(diag('.$number_of_stocks.'));
609 write.table(A, file=\''.$grm_tempfile_out.'\', row.names=FALSE, col.names=FALSE, sep=\'\t\');
611 print STDERR Dumper $cmd;
612 my $status = system($cmd);
615 return ($grm_tempfile_out, \@individuals_stock_ids, \@all_individual_accessions_stock_ids);
618 sub grm_cache_key {
619 my $self = shift;
620 my $datatype = shift;
622 #print STDERR Dumper($self->_get_dataref());
623 my $json = JSON->new();
624 #preserve order of hash keys to get same text
625 $json = $json->canonical();
626 my $sorted_accession_list = $self->accession_id_list() || [];
627 my @sorted_accession_list = sort @$sorted_accession_list;
628 my $accessions = $json->encode( \@sorted_accession_list );
629 my $plots = $json->encode( $self->plot_id_list() || [] );
630 my $protocol = $self->protocol_id() || '';
631 my $genotypeprophash = $json->encode( $self->genotypeprop_hash_select() || [] );
632 my $protocolprophash = $json->encode( $self->protocolprop_top_key_select() || [] );
633 my $protocolpropmarkerhash = $json->encode( $self->protocolprop_marker_hash_select() || [] );
634 my $maf = $self->minor_allele_frequency();
635 my $marker_filter = $self->marker_filter();
636 my $individuals_filter = $self->individuals_filter();
637 my $return_imputed_matrix = $self->return_imputed_matrix();
638 my $return_imputed_matrix_key = $return_imputed_matrix ? '_returnimputed' : '';
640 my $q_params = $accessions.$plots.$protocol.$genotypeprophash.$protocolprophash.$protocolpropmarkerhash.$self->get_grm_for_parental_accessions().$self->return_only_first_genotypeprop_for_stock()."_MAF$maf"."_mfilter$marker_filter"."_ifilter$individuals_filter"."_$datatype";
641 if ($self->return_inverse()) {
642 $q_params .= $self->return_inverse();
644 if (!$self->ensure_positive_definite()) {
645 $q_params .= $self->ensure_positive_definite();
647 my $key = md5_hex($q_params);
648 return $key;
651 sub download_grm {
652 my $self = shift;
653 my $return_type = shift || 'filehandle';
654 my $shared_cluster_dir_config = shift;
655 my $backend_config = shift;
656 my $cluster_host_config = shift;
657 my $web_cluster_queue_config = shift;
658 my $basepath_config = shift;
659 my $download_format = $self->download_format();
660 my $grm_tempfile = $self->grm_temp_file();
661 my $schema = $self->bcs_schema();
662 my $protocol_id = $self->protocol_id();
663 my $accession_list = $self->accession_id_list();
664 my $plot_list = $self->plot_id_list();
666 print STDERR "$download_format\n";
667 my $version_string = "download_grm_v04";
668 my $target_key = $version_string.$download_format;
670 my $matrix_key = $self->grm_cache_key($version_string.'matrix');
671 my $matrix_uniquenames_key = $self->grm_cache_key($version_string.'matrix_uniquenames');
672 my $three_column_key = $self->grm_cache_key($version_string.'three_column');
673 my $three_column_uniquenames_key = $self->grm_cache_key($version_string.'three_column_uniquenames');
674 my $three_column_stock_id_integer_key = $self->grm_cache_key($version_string.'three_column_stock_id_integer');
675 my $three_column_reciprocal_key = $self->grm_cache_key($version_string.'three_column_reciprocal');
676 my $three_column_reciprocal_uniquenames_key = $self->grm_cache_key($version_string.'three_column_reciprocal_uniquenames');
677 my $three_column_reciprocal_stock_id_integer_key = $self->grm_cache_key($version_string.'three_column_reciprocal_stock_id_integer');
678 my $heatmap_key = $self->grm_cache_key($version_string.'heatmap');
680 my $key = $self->grm_cache_key($target_key);
681 $self->cache( Cache::File->new( cache_root => $self->cache_root() ));
683 my $return;
684 if ($self->cache()->exists($key)) {
685 print STDERR "DOWNLOAD GRM CACHED\n";
686 if ($return_type eq 'filehandle') {
687 $return = $self->cache()->handle($key);
689 elsif ($return_type eq 'data') {
690 $return = $self->cache()->get($key);
693 else {
694 print STDERR "DOWNLOAD GRM\n";
696 my ($grm_tempfile_out, $stock_ids, $all_accession_stock_ids) = $self->_get_grm($shared_cluster_dir_config, $backend_config, $cluster_host_config, $web_cluster_queue_config, $basepath_config);
698 my %grm_hash;
699 my $row_num = 0;
700 open(my $fh, "<", $grm_tempfile_out) or die "Can't open < $grm_tempfile_out: $!";
701 while (my $row = <$fh>) {
702 chomp($row);
703 my @vals = split "\t", $row;
705 my $a_stock_id = $stock_ids->[$row_num];
706 my $col_num = 0;
707 foreach my $val (@vals) {
708 my $b_stock_id = $stock_ids->[$col_num];
709 $grm_hash{$a_stock_id}->{$b_stock_id} = $val;
710 $grm_hash{$b_stock_id}->{$a_stock_id} = $val;
711 $col_num++;
713 $row_num++;
715 close($fh);
716 # print STDERR Dumper \%grm_hash;
717 # print STDERR Dumper $all_accession_stock_ids;
719 my %all_names;
720 my $q = "SELECT stock.uniquename, stock.stock_id
721 FROM stock
722 WHERE stock.is_obsolete = 'F';";
723 my $h = $schema->storage->dbh()->prepare($q);
724 $h->execute();
725 while (my ($uniquename, $stock_id) = $h->fetchrow_array()) {
726 $all_names{$stock_id} = $uniquename;
729 my $matrix_data = '';
730 my @matrix_header = ("stock_id");
732 my $matrix_uniquenames_data = '';
733 my @matrix_uniquenames_header = ("stock_uniquename");
735 foreach (@$all_accession_stock_ids) {
736 my $s1 = $all_names{$_};
737 push @matrix_header, "S".$_;
738 push @matrix_uniquenames_header, $s1;
741 my $matrix_header_line = join "\t", @matrix_header;
742 $matrix_data = "$matrix_header_line\n";
744 my $matrix_uniquenames_header_line = join "\t", @matrix_uniquenames_header;
745 $matrix_uniquenames_data = "$matrix_uniquenames_header_line\n";
747 foreach my $s (@$all_accession_stock_ids) {
748 my $s2 = $all_names{$s};
749 my @matrix_row = ("S".$s);
750 my @matrix_uniquenames_row = ($s2);
751 foreach my $c (@$all_accession_stock_ids) {
752 my $val;
753 if ($row_num > 0 && defined($grm_hash{$s}->{$c})) {
754 $val = $grm_hash{$s}->{$c};
756 elsif ($s == $c) {
757 $val = 1;
759 else {
760 $val = 0;
763 push @matrix_row, $val;
764 push @matrix_uniquenames_row, $val;
766 my $matrix_line = join "\t", @matrix_row;
767 my $matrix_uniquenames_line = join "\t", @matrix_uniquenames_row;
768 $matrix_data .= "$matrix_line\n";
769 $matrix_uniquenames_data .= "$matrix_uniquenames_line\n";
772 $self->cache()->set($matrix_key, $matrix_data);
773 print STDERR "CACHED GRM MATRIX\n";
774 sleep(2);
775 $self->cache()->set($matrix_uniquenames_key, $matrix_uniquenames_data);
776 print STDERR "CACHED GRM MATRIX UNIQUENAMES\n";
777 sleep(2);
779 my %three_column_result_hash;
780 my $three_column_data = '';
781 my $three_column_stock_id_integer_data = '';
782 my $three_column_uniquenames_data = '';
783 foreach my $s (@$all_accession_stock_ids) {
784 foreach my $c (@$all_accession_stock_ids) {
785 if (!exists($three_column_result_hash{$s}->{$c}) && !exists($three_column_result_hash{$c}->{$s})) {
786 my $val;
787 if ($row_num > 0 && defined($grm_hash{$s}->{$c})) {
788 $val = $grm_hash{$s}->{$c};
790 elsif ($s == $c) {
791 $val = 1;
793 else {
794 $val = 0;
797 $three_column_data .= "S$s\tS$c\t$val\n";
798 $three_column_stock_id_integer_data .= "$s\t$c\t$val\n";
800 my $s1 = $all_names{$s};
801 my $s2 = $all_names{$c};
802 $three_column_uniquenames_data .= "$s1\t$s2\t$val\n";
804 $three_column_result_hash{$s}->{$c} = $val;
809 $self->cache()->set($three_column_key, $three_column_data);
810 print STDERR "CACHED GRM THREE COLUMN\n";
811 sleep(2);
812 $self->cache()->set($three_column_stock_id_integer_key, $three_column_stock_id_integer_data);
813 print STDERR "CACHED GRM THREE COLUMN STOCK ID\n";
814 sleep(2);
815 $self->cache()->set($three_column_uniquenames_key, $three_column_uniquenames_data);
816 print STDERR "CACHED GRM THREE COLUMN UNIQUENAMES\n";
817 sleep(2);
819 my $three_column_reciprocal_data = '';
820 my $three_column_reciprocal_stock_id_integer_data = '';
821 my $three_column_reciprocal_uniquenames_data = '';
822 foreach my $s (@$all_accession_stock_ids) {
823 foreach my $c (@$all_accession_stock_ids) {
824 my $val;
825 if ($row_num > 0 && defined($grm_hash{$s}->{$c})) {
826 $val = $grm_hash{$s}->{$c};
828 elsif ($s == $c) {
829 $val = 1;
831 else {
832 $val = 0;
835 $three_column_reciprocal_data .= "S$s\tS$c\t$val\n";
836 $three_column_reciprocal_stock_id_integer_data .= "$s\t$c\t$val\n";
838 my $s1 = $all_names{$s};
839 my $s2 = $all_names{$c};
840 $three_column_reciprocal_uniquenames_data .= "$s1\t$s2\t$val\n";
844 $self->cache()->set($three_column_reciprocal_key, $three_column_reciprocal_data);
845 print STDERR "CACHED GRM THREE COLUMN Reciprocal\n";
846 sleep(2);
847 $self->cache()->set($three_column_reciprocal_uniquenames_key, $three_column_reciprocal_uniquenames_data);
848 print STDERR "CACHED GRM THREE COLUMN Reciprocal UNIQUENAMES\n";
849 sleep(2);
850 $self->cache()->set($three_column_reciprocal_stock_id_integer_key, $three_column_reciprocal_stock_id_integer_data);
851 print STDERR "CACHED GRM THREE COLUMN Reciprocal STOCK IDS\n";
852 sleep(2);
854 my $grm_tempfile_heatmap_data = $grm_tempfile . "_heatmap_data";
855 open(my $heatmap_fh, '>', $grm_tempfile_heatmap_data) or die $!;
856 print $heatmap_fh $three_column_reciprocal_uniquenames_data;
857 close($heatmap_fh);
859 my $grm_tempfile_heatmap_out = $grm_tempfile . "_plot_out";
860 my $heatmap_cmd = 'R -e "library(ggplot2); library(data.table); library(viridis); library(GGally); library(gridExtra);
861 mat <- fread(\''.$grm_tempfile_heatmap_data.'\', header=FALSE, sep=\'\t\', stringsAsFactors=FALSE);
862 gg <- ggplot(mat, aes(V1, V2, fill=V3)) +
863 geom_tile() +
864 scale_fill_viridis(discrete=FALSE);
865 ggsave(\''.$grm_tempfile_heatmap_out.'\', gg, device=\'pdf\', width=8.5, height=11, units=\'in\');
867 print STDERR Dumper $heatmap_cmd;
868 my $status_heatmap = system($heatmap_cmd);
870 open my $out_copy, '<', $grm_tempfile_heatmap_out or die "Can't open output file: $!";
871 $self->cache()->set($heatmap_key, '');
872 my $file_handle = $self->cache()->handle($heatmap_key);
873 copy($out_copy, $file_handle);
874 sleep(5);
875 close $out_copy;
876 print STDERR "CACHED GRM HEATMAP\n";
878 if ($return_type eq 'filehandle') {
879 $return = $self->cache()->handle($key);
881 elsif ($return_type eq 'data') {
882 my $data = '';
883 if ($download_format eq 'matrix') {
884 $data = $matrix_data;
886 elsif ($download_format eq 'matrix_uniquenames') {
887 $data = $matrix_uniquenames_data;
889 elsif ($download_format eq 'three_column') {
890 $data = $three_column_data;
892 elsif ($download_format eq 'three_column_uniquenames') {
893 $data = $three_column_uniquenames_data;
895 elsif ($download_format eq 'three_column_stock_id_integer') {
896 $data = $three_column_stock_id_integer_data;
898 elsif ($download_format eq 'three_column_reciprocal') {
899 $data = $three_column_reciprocal_data;
901 elsif ($download_format eq 'three_column_reciprocal_uniquenames') {
902 $data = $three_column_reciprocal_uniquenames_data;
904 elsif ($download_format eq 'three_column_reciprocal_stock_id_integer') {
905 $data = $three_column_reciprocal_stock_id_integer_data;
907 elsif ($download_format eq 'heatmap') {
908 die "Can only return the filehandle (Not Data) for GRM heatmap!\n";
910 $return = $data;
913 return $return;
916 sub genosort {
917 my ($a_chr, $a_pos, $b_chr, $b_pos);
918 if ($a =~ m/S(\d+)\_(.*)/) {
919 $a_chr = $1;
920 $a_pos = $2;
922 if ($b =~ m/S(\d+)\_(.*)/) {
923 $b_chr = $1;
924 $b_pos = $2;
927 if ($a_chr && $b_chr) {
928 if ($a_chr == $b_chr) {
929 return $a_pos <=> $b_pos;
931 return $a_chr <=> $b_chr;
932 } else {
933 return -1;