Merge pull request #5243 from solgenomics/topic/observations_upload_catch_error
[sgn.git] / lib / CXGN / Pedigree / ARM.pm
blob406dd09204152e5141eaea911d595e8e4fc196bb
1 package CXGN::Pedigree::ARM;
3 =head1 NAME
5 CXGN::Genotype::GRM - an object to handle fetching an additive relationship matrix (ARM) from pedigrees for stocks
7 =head1 USAGE
9 my $pedigree_arm = CXGN::Pedigree::ARM->new({
10 bcs_schema=>$schema,
11 arm_temp_file=>$file_temp_path,
12 people_schema=>$people_schema,
13 accession_id_list=>\@accession_list,
14 plot_id_list=>\@plot_id_list,
15 cache_root=>$cache_root,
16 download_format=>'matrix', #either 'matrix', 'three_column', or 'heatmap'
17 });
18 RECOMMENDED
19 $pedigree_arm->download_arm();
23 my $arm = $pedigree_arm->get_arm();
25 =head1 DESCRIPTION
28 =head1 AUTHORS
30 Nicolas Morales <nm529@cornell.edu>
32 =cut
34 use strict;
35 use warnings;
36 use Moose;
37 use Try::Tiny;
38 use Data::Dumper;
39 use SGN::Model::Cvterm;
40 use CXGN::Trial;
41 use JSON;
42 use CXGN::Stock::Accession;
43 use CXGN::Genotype::Protocol;
44 use CXGN::Genotype::Search;
45 use CXGN::Genotype::ComputeHybridGenotype;
46 use R::YapRI::Base;
47 use R::YapRI::Data::Matrix;
48 use CXGN::Dataset::Cache;
49 use Cache::File;
50 use Digest::MD5 qw | md5_hex |;
51 use File::Slurp qw | write_file |;
52 use POSIX;
53 use File::Copy;
54 use CXGN::Tools::Run;
55 use File::Temp 'tempfile';
57 has 'bcs_schema' => (
58 isa => 'Bio::Chado::Schema',
59 is => 'rw',
60 required => 1
63 has 'people_schema' => (
64 isa => 'CXGN::People::Schema',
65 is => 'rw',
66 required => 1
69 # Uses a cached file system for getting genotype results and getting GRM
70 has 'cache_root' => (
71 isa => 'Str',
72 is => 'rw',
73 required => 1
76 has 'download_format' => (
77 isa => 'Str',
78 is => 'rw',
79 required => 1
82 has 'cache' => (
83 isa => 'Cache::File',
84 is => 'rw',
87 has 'cache_expiry' => (
88 isa => 'Int',
89 is => 'rw',
90 default => 0, # never expires?
93 has '_cache_key' => (
94 isa => 'Str',
95 is => 'rw',
98 has 'arm_temp_file' => (
99 isa => 'Str',
100 is => 'rw',
101 required => 1
104 has 'accession_id_list' => (
105 isa => 'ArrayRef[Int]|Undef',
106 is => 'rw'
109 has 'plot_id_list' => (
110 isa => 'ArrayRef[Int]|Undef',
111 is => 'rw'
114 sub get_arm {
115 my $self = shift;
116 my $shared_cluster_dir_config = shift;
117 my $backend_config = shift;
118 my $cluster_host_config = shift;
119 my $web_cluster_queue_config = shift;
120 my $basepath_config = shift;
121 my $schema = $self->bcs_schema();
122 my $people_schema = $self->people_schema();
123 my $cache_root_dir = $self->cache_root();
124 my $accession_list = $self->accession_id_list();
125 my $plot_list = $self->plot_id_list();
127 my $accession_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
128 my $plot_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'plot', 'stock_type')->cvterm_id();
129 my $plot_of_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'plot_of', 'stock_relationship')->cvterm_id();
130 my $female_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'female_parent', 'stock_relationship')->cvterm_id();
131 my $male_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'male_parent', 'stock_relationship')->cvterm_id();
133 my $number_system_cores = `getconf _NPROCESSORS_ONLN` or die "Could not get number of system cores!\n";
134 chomp($number_system_cores);
135 print STDERR "NUMCORES $number_system_cores\n";
137 my @individuals_stock_ids;
138 my @all_individual_accessions_stock_ids;
139 my %parent_hash;
140 my %seen_female_parents;
141 my %seen_male_parents;
143 if ($plot_list && scalar(@$plot_list)>0) {
144 print STDERR "COMPUTING ARM FROM PARENTS FOR PLOTS\n";
145 my $plot_list_string = join ',', @$plot_list;
146 my $q = "SELECT plot.stock_id, accession.stock_id, female_parent.stock_id, male_parent.stock_id
147 FROM stock AS plot
148 JOIN stock_relationship AS plot_acc_rel ON(plot_acc_rel.subject_id=plot.stock_id AND plot_acc_rel.type_id=$plot_of_cvterm_id)
149 JOIN stock AS accession ON(plot_acc_rel.object_id=accession.stock_id AND accession.type_id=$accession_cvterm_id)
150 LEFT JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
151 LEFT JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
152 LEFT JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
153 LEFT JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
154 WHERE plot.type_id=$plot_cvterm_id AND plot.stock_id IN ($plot_list_string);";
155 my $h = $schema->storage->dbh()->prepare($q);
156 $h->execute();
157 my @plot_stock_ids_found = ();
158 my @plot_accession_stock_ids_found = ();
159 my @plot_female_stock_ids_found = ();
160 my @plot_male_stock_ids_found = ();
161 while (my ($plot_stock_id, $accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
162 push @plot_stock_ids_found, $plot_stock_id;
163 push @plot_accession_stock_ids_found, $accession_stock_id;
164 push @plot_female_stock_ids_found, $female_parent_stock_id;
165 push @plot_male_stock_ids_found, $male_parent_stock_id;
168 @all_individual_accessions_stock_ids = @plot_accession_stock_ids_found;
169 @individuals_stock_ids = @plot_stock_ids_found;
171 # print STDERR Dumper \@plot_stock_ids_found;
172 # print STDERR Dumper \@plot_female_stock_ids_found;
173 # print STDERR Dumper \@plot_male_stock_ids_found;
175 for my $i (0..scalar(@plot_stock_ids_found)-1) {
176 my $female_stock_id = $plot_female_stock_ids_found[$i];
177 my $male_stock_id = $plot_male_stock_ids_found[$i];
178 my $plot_stock_id = $plot_stock_ids_found[$i];
180 my $val = {};
181 if ($female_stock_id) {
182 $seen_female_parents{$female_stock_id}++;
183 $val->{female_stock_id} = $female_stock_id;
185 if ($male_stock_id) {
186 $seen_male_parents{$male_stock_id}++;
187 $val->{male_stock_id} = $male_stock_id;
189 $parent_hash{$plot_stock_id} = $val;
192 elsif ($accession_list && scalar(@$accession_list)>0) {
193 print STDERR "COMPUTING ARM FROM PARENTS FOR ACCESSIONS\n";
194 my $accession_list_string = join ',', @$accession_list;
195 my $q = "SELECT accession.stock_id, female_parent.stock_id, male_parent.stock_id
196 FROM stock AS accession
197 LEFT JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
198 LEFT JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
199 LEFT JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
200 LEFT JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
201 WHERE accession.type_id=$accession_cvterm_id AND accession.stock_id IN ($accession_list_string);";
202 my $h = $schema->storage->dbh()->prepare($q);
203 $h->execute();
204 my @accession_stock_ids_found = ();
205 my @female_stock_ids_found = ();
206 my @male_stock_ids_found = ();
207 while (my ($accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
208 push @accession_stock_ids_found, $accession_stock_id;
209 push @female_stock_ids_found, $female_parent_stock_id;
210 push @male_stock_ids_found, $male_parent_stock_id;
213 # print STDERR Dumper \@accession_stock_ids_found;
214 # print STDERR Dumper \@female_stock_ids_found;
215 # print STDERR Dumper \@male_stock_ids_found;
217 @all_individual_accessions_stock_ids = @$accession_list;
218 @individuals_stock_ids = @$accession_list;
220 for my $i (0..scalar(@accession_stock_ids_found)-1) {
221 my $female_stock_id = $female_stock_ids_found[$i];
222 my $male_stock_id = $male_stock_ids_found[$i];
223 my $accession_stock_id = $accession_stock_ids_found[$i];
225 my $val = {};
226 if ($female_stock_id) {
227 $seen_female_parents{$female_stock_id}++;
228 $val->{female_stock_id} = $female_stock_id;
230 if ($male_stock_id) {
231 $seen_male_parents{$male_stock_id}++;
232 $val->{male_stock_id} = $male_stock_id;
234 $parent_hash{$accession_stock_id} = $val;
238 my @female_stock_ids = keys %seen_female_parents;
239 my @male_stock_ids = keys %seen_male_parents;
241 return (\%parent_hash, \@individuals_stock_ids, \@all_individual_accessions_stock_ids, \@female_stock_ids, \@male_stock_ids);
244 sub arm_cache_key {
245 my $self = shift;
246 my $datatype = shift;
248 #print STDERR Dumper($self->_get_dataref());
249 my $json = JSON->new();
250 #preserve order of hash keys to get same text
251 $json = $json->canonical();
252 my $sorted_accession_list = $self->accession_id_list() || [];
253 my @sorted_accession_list = sort @$sorted_accession_list;
254 my $accessions = $json->encode( \@sorted_accession_list );
255 my $plots = $json->encode( $self->plot_id_list() || [] );
256 my $key = md5_hex($accessions.$plots."_$datatype");
257 return $key;
260 sub download_arm {
261 my $self = shift;
262 my $return_type = shift || 'filehandle';
263 my $shared_cluster_dir_config = shift;
264 my $backend_config = shift;
265 my $cluster_host_config = shift;
266 my $web_cluster_queue_config = shift;
267 my $basepath_config = shift;
268 my $download_format = $self->download_format();
269 my $arm_tempfile = $self->arm_temp_file();
271 my $key = $self->arm_cache_key("download_arm_v01".$download_format);
272 $self->_cache_key($key);
273 $self->cache( Cache::File->new( cache_root => $self->cache_root() ));
275 my $return;
276 if ($self->cache()->exists($key)) {
277 if ($return_type eq 'filehandle') {
278 $return = $self->cache()->handle($key);
280 elsif ($return_type eq 'data') {
281 $return = $self->cache()->get($key);
284 else {
285 my ($parent_hash, $stock_ids, $all_accession_stock_ids, $female_stock_ids, $male_stock_ids) = $self->get_arm($shared_cluster_dir_config, $backend_config, $cluster_host_config, $web_cluster_queue_config, $basepath_config);
287 my @grm;
289 my $data = '';
290 if ($download_format eq 'matrix') {
291 my @header = ("stock_id");
292 foreach (@$stock_ids) {
293 push @header, "S".$_;
296 my $header_line = join "\t", @header;
297 $data = "$header_line\n";
299 foreach my $s (@$stock_ids) {
300 my @row = ("S".$s);
301 foreach my $c (@$stock_ids) {
302 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id};
303 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id};
304 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id};
305 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id};
306 my $rel = 0;
307 if ($s1_female_parent == $s2_female_parent) {
308 $rel += 1;
310 if ($s1_male_parent == $s2_male_parent) {
311 $rel += 1;
313 push @row, $rel/2;
315 my $line = join "\t", @row;
316 $data .= "$line\n";
319 $self->cache()->set($key, $data);
320 if ($return_type eq 'filehandle') {
321 $return = $self->cache()->handle($key);
323 elsif ($return_type eq 'data') {
324 $return = $data;
327 elsif ($download_format eq 'three_column') {
328 my %result_hash;
329 my $row_num = 0;
330 my %seen_stock_ids;
331 # print STDERR Dumper \@grm;
332 foreach my $s (@$stock_ids) {
333 foreach my $c (@$stock_ids) {
334 if (!exists($result_hash{$s}->{$c}) && !exists($result_hash{$c}->{$s})) {
335 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id};
336 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id};
337 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id};
338 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id};
339 my $rel = 0;
340 if ($s1_female_parent == $s2_female_parent) {
341 $rel += 1;
343 if ($s1_male_parent == $s2_male_parent) {
344 $rel += 1;
346 $result_hash{$s}->{$c} = $rel/2;
347 $seen_stock_ids{$s}++;
348 $seen_stock_ids{$c}++;
353 foreach my $r (sort keys %result_hash) {
354 foreach my $s (sort keys %{$result_hash{$r}}) {
355 my $val = $result_hash{$r}->{$s};
356 if (defined $val and length $val) {
357 $data .= "S$r\tS$s\t$val\n";
362 foreach my $a (@$all_accession_stock_ids) {
363 if (!exists($seen_stock_ids{$a})) {
364 $data .= "S$a\tS$a\t1\n";
368 $self->cache()->set($key, $data);
369 if ($return_type eq 'filehandle') {
370 $return = $self->cache()->handle($key);
372 elsif ($return_type eq 'data') {
373 $return = $data;
376 elsif ($download_format eq 'three_column_stock_id_integer') {
377 my %result_hash;
378 my $row_num = 0;
379 my %seen_stock_ids;
380 foreach my $s (@$stock_ids) {
381 foreach my $c (@$stock_ids) {
382 if (!exists($result_hash{$s}->{$c}) && !exists($result_hash{$c}->{$s})) {
383 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id};
384 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id};
385 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id};
386 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id};
387 my $rel = 0;
388 if ($s1_female_parent == $s2_female_parent) {
389 $rel += 1;
391 if ($s1_male_parent == $s2_male_parent) {
392 $rel += 1;
394 $result_hash{$s}->{$c} = $rel/2;
395 $seen_stock_ids{$s}++;
396 $seen_stock_ids{$c}++;
401 foreach my $r (sort keys %result_hash) {
402 foreach my $s (sort keys %{$result_hash{$r}}) {
403 my $val = $result_hash{$r}->{$s};
404 if (defined $val and length $val) {
405 $data .= "$r\t$s\t$val\n";
410 foreach my $a (@$all_accession_stock_ids) {
411 if (!exists($seen_stock_ids{$a})) {
412 $data .= "$a\t$a\t1\n";
416 $self->cache()->set($key, $data);
417 if ($return_type eq 'filehandle') {
418 $return = $self->cache()->handle($key);
420 elsif ($return_type eq 'data') {
421 $return = $data;
424 elsif ($download_format eq 'three_column_reciprocal') {
425 my %result_hash;
426 my $row_num = 0;
427 my %seen_stock_ids;
428 # print STDERR Dumper \@grm;
429 foreach my $s (@$stock_ids) {
430 foreach my $c (@$stock_ids) {
431 if (!exists($result_hash{$s}->{$c}) && !exists($result_hash{$c}->{$s})) {
432 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id};
433 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id};
434 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id};
435 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id};
436 my $rel = 0;
437 if ($s1_female_parent == $s2_female_parent) {
438 $rel += 1;
440 if ($s1_male_parent == $s2_male_parent) {
441 $rel += 1;
443 $result_hash{$s}->{$c} = $rel/2;
444 $seen_stock_ids{$s}++;
445 $seen_stock_ids{$c}++;
450 foreach my $r (sort keys %result_hash) {
451 foreach my $s (sort keys %{$result_hash{$r}}) {
452 my $val = $result_hash{$r}->{$s};
453 if (defined $val and length $val) {
454 $data .= "S$r\tS$s\t$val\n";
455 if ($s != $r) {
456 $data .= "S$s\tS$r\t$val\n";
462 foreach my $a (@$all_accession_stock_ids) {
463 if (!exists($seen_stock_ids{$a})) {
464 $data .= "S$a\tS$a\t1\n";
468 $self->cache()->set($key, $data);
469 if ($return_type eq 'filehandle') {
470 $return = $self->cache()->handle($key);
472 elsif ($return_type eq 'data') {
473 $return = $data;
476 elsif ($download_format eq 'three_column_reciprocal_stock_id_integer') {
477 my %result_hash;
478 my $row_num = 0;
479 my %seen_stock_ids;
480 # print STDERR Dumper \@grm;
481 foreach my $s (@$stock_ids) {
482 foreach my $c (@$stock_ids) {
483 if (!exists($result_hash{$s}->{$c}) && !exists($result_hash{$c}->{$s})) {
484 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id};
485 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id};
486 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id};
487 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id};
488 my $rel = 0;
489 if ($s1_female_parent == $s2_female_parent) {
490 $rel += 1;
492 if ($s1_male_parent == $s2_male_parent) {
493 $rel += 1;
495 $result_hash{$s}->{$c} = $rel/2;
496 $seen_stock_ids{$s}++;
497 $seen_stock_ids{$c}++;
502 foreach my $r (sort keys %result_hash) {
503 foreach my $s (sort keys %{$result_hash{$r}}) {
504 my $val = $result_hash{$r}->{$s};
505 if (defined $val and length $val) {
506 $data .= "$r\t$s\t$val\n";
507 if ($s != $r) {
508 $data .= "$s\t$r\t$val\n";
514 foreach my $a (@$all_accession_stock_ids) {
515 if (!exists($seen_stock_ids{$a})) {
516 $data .= "$a\t$a\t1\n";
520 $self->cache()->set($key, $data);
521 if ($return_type eq 'filehandle') {
522 $return = $self->cache()->handle($key);
524 elsif ($return_type eq 'data') {
525 $return = $data;
528 elsif ($download_format eq 'heatmap') {
529 my %result_hash;
530 my $row_num = 0;
531 my %seen_stock_ids;
532 foreach my $s (@$stock_ids) {
533 my @row = ($s);
534 foreach my $c (@$stock_ids) {
535 if (!exists($result_hash{$s}->{$c}) && !exists($result_hash{$c}->{$s})) {
536 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id};
537 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id};
538 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id};
539 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id};
540 my $rel = 0;
541 if ($s1_female_parent == $s2_female_parent) {
542 $rel += 1;
544 if ($s1_male_parent == $s2_male_parent) {
545 $rel += 1;
547 $result_hash{$s}->{$c} = $rel/2;
548 $seen_stock_ids{$s}++;
549 $seen_stock_ids{$c}++;
554 foreach my $r (sort keys %result_hash) {
555 foreach my $s (sort keys %{$result_hash{$r}}) {
556 my $val = $result_hash{$r}->{$s};
557 $data .= "$r\t$s\t$val\n";
561 foreach my $a (@$all_accession_stock_ids) {
562 if (!exists($seen_stock_ids{$a})) {
563 $data .= "$a\t$a\t1\n";
567 open(my $heatmap_fh, '>', $arm_tempfile) or die $!;
568 print $heatmap_fh $data;
569 close($heatmap_fh);
571 my $arm_tempfile_out = $arm_tempfile . "_plot_out";
572 my $heatmap_cmd = 'R -e "library(ggplot2); library(data.table);
573 mat <- fread(\''.$arm_tempfile.'\', header=FALSE, sep=\'\t\', stringsAsFactors=FALSE);
574 pdf( \''.$arm_tempfile_out.'\', width = 8.5, height = 11);
575 ggplot(data = mat, aes(x=V1, y=V2, fill=V3)) + geom_tile();
576 dev.off();
578 print STDERR Dumper $heatmap_cmd;
580 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_download_arm_heatmap";
581 mkdir $tmp_output_dir if ! -d $tmp_output_dir;
583 # Do the GRM on the cluster
584 my $plot_cmd = CXGN::Tools::Run->new(
586 backend => $backend_config,
587 submit_host => $cluster_host_config,
588 temp_base => $tmp_output_dir,
589 queue => $web_cluster_queue_config,
590 do_cleanup => 0,
591 out_file => $arm_tempfile_out,
592 # don't block and wait if the cluster looks full
593 max_cluster_jobs => 1_000_000_000,
597 $plot_cmd->run_cluster($heatmap_cmd);
598 $plot_cmd->is_cluster(1);
599 $plot_cmd->wait;
601 if ($return_type eq 'filehandle') {
602 open my $out_copy, '<', $arm_tempfile_out or die "Can't open output file: $!";
604 $self->cache()->set($key, '');
605 my $file_handle = $self->cache()->handle($key);
606 copy($out_copy, $file_handle);
608 close $out_copy;
609 $return = $self->cache()->handle($key);
611 elsif ($return_type eq 'data') {
612 die "Can only return the filehandle for ARM heatmap!\n";
616 return $return;