1 package CXGN
::Pedigree
::ARM
;
5 CXGN::Genotype::GRM - an object to handle fetching an additive relationship matrix (ARM) from pedigrees for stocks
9 my $pedigree_arm = CXGN::Pedigree::ARM->new({
11 arm_temp_file=>$file_temp_path,
12 people_schema=>$people_schema,
13 accession_id_list=>\@accession_list,
14 plot_id_list=>\@plot_id_list,
15 cache_root=>$cache_root,
16 download_format=>'matrix', #either 'matrix', 'three_column', or 'heatmap'
19 $pedigree_arm->download_arm();
23 my $arm = $pedigree_arm->get_arm();
30 Nicolas Morales <nm529@cornell.edu>
39 use SGN
::Model
::Cvterm
;
42 use CXGN
::Stock
::Accession
;
43 use CXGN
::Genotype
::Protocol
;
44 use CXGN
::Genotype
::Search
;
45 use CXGN
::Genotype
::ComputeHybridGenotype
;
47 use R
::YapRI
::Data
::Matrix
;
48 use CXGN
::Dataset
::Cache
;
50 use Digest
::MD5 qw
| md5_hex
|;
51 use File
::Slurp qw
| write_file
|;
55 use File
::Temp
'tempfile';
58 isa
=> 'Bio::Chado::Schema',
63 has
'people_schema' => (
64 isa
=> 'CXGN::People::Schema',
69 # Uses a cached file system for getting genotype results and getting GRM
76 has
'download_format' => (
87 has
'cache_expiry' => (
90 default => 0, # never expires?
98 has
'arm_temp_file' => (
104 has
'accession_id_list' => (
105 isa
=> 'ArrayRef[Int]|Undef',
109 has
'plot_id_list' => (
110 isa
=> 'ArrayRef[Int]|Undef',
116 my $shared_cluster_dir_config = shift;
117 my $backend_config = shift;
118 my $cluster_host_config = shift;
119 my $web_cluster_queue_config = shift;
120 my $basepath_config = shift;
121 my $schema = $self->bcs_schema();
122 my $people_schema = $self->people_schema();
123 my $cache_root_dir = $self->cache_root();
124 my $accession_list = $self->accession_id_list();
125 my $plot_list = $self->plot_id_list();
127 my $accession_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
128 my $plot_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'plot', 'stock_type')->cvterm_id();
129 my $plot_of_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'plot_of', 'stock_relationship')->cvterm_id();
130 my $female_parent_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'female_parent', 'stock_relationship')->cvterm_id();
131 my $male_parent_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'male_parent', 'stock_relationship')->cvterm_id();
133 my $number_system_cores = `getconf _NPROCESSORS_ONLN` or die "Could not get number of system cores!\n";
134 chomp($number_system_cores);
135 print STDERR
"NUMCORES $number_system_cores\n";
137 my @individuals_stock_ids;
138 my @all_individual_accessions_stock_ids;
140 my %seen_female_parents;
141 my %seen_male_parents;
143 if ($plot_list && scalar(@
$plot_list)>0) {
144 print STDERR
"COMPUTING ARM FROM PARENTS FOR PLOTS\n";
145 my $plot_list_string = join ',', @
$plot_list;
146 my $q = "SELECT plot.stock_id, accession.stock_id, female_parent.stock_id, male_parent.stock_id
148 JOIN stock_relationship AS plot_acc_rel ON(plot_acc_rel.subject_id=plot.stock_id AND plot_acc_rel.type_id=$plot_of_cvterm_id)
149 JOIN stock AS accession ON(plot_acc_rel.object_id=accession.stock_id AND accession.type_id=$accession_cvterm_id)
150 LEFT JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
151 LEFT JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
152 LEFT JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
153 LEFT JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
154 WHERE plot.type_id=$plot_cvterm_id AND plot.stock_id IN ($plot_list_string);";
155 my $h = $schema->storage->dbh()->prepare($q);
157 my @plot_stock_ids_found = ();
158 my @plot_accession_stock_ids_found = ();
159 my @plot_female_stock_ids_found = ();
160 my @plot_male_stock_ids_found = ();
161 while (my ($plot_stock_id, $accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
162 push @plot_stock_ids_found, $plot_stock_id;
163 push @plot_accession_stock_ids_found, $accession_stock_id;
164 push @plot_female_stock_ids_found, $female_parent_stock_id;
165 push @plot_male_stock_ids_found, $male_parent_stock_id;
168 @all_individual_accessions_stock_ids = @plot_accession_stock_ids_found;
169 @individuals_stock_ids = @plot_stock_ids_found;
171 # print STDERR Dumper \@plot_stock_ids_found;
172 # print STDERR Dumper \@plot_female_stock_ids_found;
173 # print STDERR Dumper \@plot_male_stock_ids_found;
175 for my $i (0..scalar(@plot_stock_ids_found)-1) {
176 my $female_stock_id = $plot_female_stock_ids_found[$i];
177 my $male_stock_id = $plot_male_stock_ids_found[$i];
178 my $plot_stock_id = $plot_stock_ids_found[$i];
181 if ($female_stock_id) {
182 $seen_female_parents{$female_stock_id}++;
183 $val->{female_stock_id
} = $female_stock_id;
185 if ($male_stock_id) {
186 $seen_male_parents{$male_stock_id}++;
187 $val->{male_stock_id
} = $male_stock_id;
189 $parent_hash{$plot_stock_id} = $val;
192 elsif ($accession_list && scalar(@
$accession_list)>0) {
193 print STDERR
"COMPUTING ARM FROM PARENTS FOR ACCESSIONS\n";
194 my $accession_list_string = join ',', @
$accession_list;
195 my $q = "SELECT accession.stock_id, female_parent.stock_id, male_parent.stock_id
196 FROM stock AS accession
197 LEFT JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
198 LEFT JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
199 LEFT JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
200 LEFT JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
201 WHERE accession.type_id=$accession_cvterm_id AND accession.stock_id IN ($accession_list_string);";
202 my $h = $schema->storage->dbh()->prepare($q);
204 my @accession_stock_ids_found = ();
205 my @female_stock_ids_found = ();
206 my @male_stock_ids_found = ();
207 while (my ($accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
208 push @accession_stock_ids_found, $accession_stock_id;
209 push @female_stock_ids_found, $female_parent_stock_id;
210 push @male_stock_ids_found, $male_parent_stock_id;
213 # print STDERR Dumper \@accession_stock_ids_found;
214 # print STDERR Dumper \@female_stock_ids_found;
215 # print STDERR Dumper \@male_stock_ids_found;
217 @all_individual_accessions_stock_ids = @
$accession_list;
218 @individuals_stock_ids = @
$accession_list;
220 for my $i (0..scalar(@accession_stock_ids_found)-1) {
221 my $female_stock_id = $female_stock_ids_found[$i];
222 my $male_stock_id = $male_stock_ids_found[$i];
223 my $accession_stock_id = $accession_stock_ids_found[$i];
226 if ($female_stock_id) {
227 $seen_female_parents{$female_stock_id}++;
228 $val->{female_stock_id
} = $female_stock_id;
230 if ($male_stock_id) {
231 $seen_male_parents{$male_stock_id}++;
232 $val->{male_stock_id
} = $male_stock_id;
234 $parent_hash{$accession_stock_id} = $val;
238 my @female_stock_ids = keys %seen_female_parents;
239 my @male_stock_ids = keys %seen_male_parents;
241 return (\
%parent_hash, \
@individuals_stock_ids, \
@all_individual_accessions_stock_ids, \
@female_stock_ids, \
@male_stock_ids);
246 my $datatype = shift;
248 #print STDERR Dumper($self->_get_dataref());
249 my $json = JSON
->new();
250 #preserve order of hash keys to get same text
251 $json = $json->canonical();
252 my $sorted_accession_list = $self->accession_id_list() || [];
253 my @sorted_accession_list = sort @
$sorted_accession_list;
254 my $accessions = $json->encode( \
@sorted_accession_list );
255 my $plots = $json->encode( $self->plot_id_list() || [] );
256 my $key = md5_hex
($accessions.$plots."_$datatype");
262 my $return_type = shift || 'filehandle';
263 my $shared_cluster_dir_config = shift;
264 my $backend_config = shift;
265 my $cluster_host_config = shift;
266 my $web_cluster_queue_config = shift;
267 my $basepath_config = shift;
268 my $download_format = $self->download_format();
269 my $arm_tempfile = $self->arm_temp_file();
271 my $key = $self->arm_cache_key("download_arm_v01".$download_format);
272 $self->_cache_key($key);
273 $self->cache( Cache
::File
->new( cache_root
=> $self->cache_root() ));
276 if ($self->cache()->exists($key)) {
277 if ($return_type eq 'filehandle') {
278 $return = $self->cache()->handle($key);
280 elsif ($return_type eq 'data') {
281 $return = $self->cache()->get($key);
285 my ($parent_hash, $stock_ids, $all_accession_stock_ids, $female_stock_ids, $male_stock_ids) = $self->get_arm($shared_cluster_dir_config, $backend_config, $cluster_host_config, $web_cluster_queue_config, $basepath_config);
290 if ($download_format eq 'matrix') {
291 my @header = ("stock_id");
292 foreach (@
$stock_ids) {
293 push @header, "S".$_;
296 my $header_line = join "\t", @header;
297 $data = "$header_line\n";
299 foreach my $s (@
$stock_ids) {
301 foreach my $c (@
$stock_ids) {
302 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id
};
303 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id
};
304 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id
};
305 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id
};
307 if ($s1_female_parent == $s2_female_parent) {
310 if ($s1_male_parent == $s2_male_parent) {
315 my $line = join "\t", @row;
319 $self->cache()->set($key, $data);
320 if ($return_type eq 'filehandle') {
321 $return = $self->cache()->handle($key);
323 elsif ($return_type eq 'data') {
327 elsif ($download_format eq 'three_column') {
331 # print STDERR Dumper \@grm;
332 foreach my $s (@
$stock_ids) {
333 foreach my $c (@
$stock_ids) {
334 if (!exists($result_hash{$s}->{$c}) && !exists($result_hash{$c}->{$s})) {
335 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id
};
336 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id
};
337 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id
};
338 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id
};
340 if ($s1_female_parent == $s2_female_parent) {
343 if ($s1_male_parent == $s2_male_parent) {
346 $result_hash{$s}->{$c} = $rel/2;
347 $seen_stock_ids{$s}++;
348 $seen_stock_ids{$c}++;
353 foreach my $r (sort keys %result_hash) {
354 foreach my $s (sort keys %{$result_hash{$r}}) {
355 my $val = $result_hash{$r}->{$s};
356 if (defined $val and length $val) {
357 $data .= "S$r\tS$s\t$val\n";
362 foreach my $a (@
$all_accession_stock_ids) {
363 if (!exists($seen_stock_ids{$a})) {
364 $data .= "S$a\tS$a\t1\n";
368 $self->cache()->set($key, $data);
369 if ($return_type eq 'filehandle') {
370 $return = $self->cache()->handle($key);
372 elsif ($return_type eq 'data') {
376 elsif ($download_format eq 'three_column_stock_id_integer') {
380 foreach my $s (@
$stock_ids) {
381 foreach my $c (@
$stock_ids) {
382 if (!exists($result_hash{$s}->{$c}) && !exists($result_hash{$c}->{$s})) {
383 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id
};
384 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id
};
385 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id
};
386 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id
};
388 if ($s1_female_parent == $s2_female_parent) {
391 if ($s1_male_parent == $s2_male_parent) {
394 $result_hash{$s}->{$c} = $rel/2;
395 $seen_stock_ids{$s}++;
396 $seen_stock_ids{$c}++;
401 foreach my $r (sort keys %result_hash) {
402 foreach my $s (sort keys %{$result_hash{$r}}) {
403 my $val = $result_hash{$r}->{$s};
404 if (defined $val and length $val) {
405 $data .= "$r\t$s\t$val\n";
410 foreach my $a (@
$all_accession_stock_ids) {
411 if (!exists($seen_stock_ids{$a})) {
412 $data .= "$a\t$a\t1\n";
416 $self->cache()->set($key, $data);
417 if ($return_type eq 'filehandle') {
418 $return = $self->cache()->handle($key);
420 elsif ($return_type eq 'data') {
424 elsif ($download_format eq 'three_column_reciprocal') {
428 # print STDERR Dumper \@grm;
429 foreach my $s (@
$stock_ids) {
430 foreach my $c (@
$stock_ids) {
431 if (!exists($result_hash{$s}->{$c}) && !exists($result_hash{$c}->{$s})) {
432 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id
};
433 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id
};
434 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id
};
435 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id
};
437 if ($s1_female_parent == $s2_female_parent) {
440 if ($s1_male_parent == $s2_male_parent) {
443 $result_hash{$s}->{$c} = $rel/2;
444 $seen_stock_ids{$s}++;
445 $seen_stock_ids{$c}++;
450 foreach my $r (sort keys %result_hash) {
451 foreach my $s (sort keys %{$result_hash{$r}}) {
452 my $val = $result_hash{$r}->{$s};
453 if (defined $val and length $val) {
454 $data .= "S$r\tS$s\t$val\n";
456 $data .= "S$s\tS$r\t$val\n";
462 foreach my $a (@
$all_accession_stock_ids) {
463 if (!exists($seen_stock_ids{$a})) {
464 $data .= "S$a\tS$a\t1\n";
468 $self->cache()->set($key, $data);
469 if ($return_type eq 'filehandle') {
470 $return = $self->cache()->handle($key);
472 elsif ($return_type eq 'data') {
476 elsif ($download_format eq 'three_column_reciprocal_stock_id_integer') {
480 # print STDERR Dumper \@grm;
481 foreach my $s (@
$stock_ids) {
482 foreach my $c (@
$stock_ids) {
483 if (!exists($result_hash{$s}->{$c}) && !exists($result_hash{$c}->{$s})) {
484 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id
};
485 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id
};
486 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id
};
487 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id
};
489 if ($s1_female_parent == $s2_female_parent) {
492 if ($s1_male_parent == $s2_male_parent) {
495 $result_hash{$s}->{$c} = $rel/2;
496 $seen_stock_ids{$s}++;
497 $seen_stock_ids{$c}++;
502 foreach my $r (sort keys %result_hash) {
503 foreach my $s (sort keys %{$result_hash{$r}}) {
504 my $val = $result_hash{$r}->{$s};
505 if (defined $val and length $val) {
506 $data .= "$r\t$s\t$val\n";
508 $data .= "$s\t$r\t$val\n";
514 foreach my $a (@
$all_accession_stock_ids) {
515 if (!exists($seen_stock_ids{$a})) {
516 $data .= "$a\t$a\t1\n";
520 $self->cache()->set($key, $data);
521 if ($return_type eq 'filehandle') {
522 $return = $self->cache()->handle($key);
524 elsif ($return_type eq 'data') {
528 elsif ($download_format eq 'heatmap') {
532 foreach my $s (@
$stock_ids) {
534 foreach my $c (@
$stock_ids) {
535 if (!exists($result_hash{$s}->{$c}) && !exists($result_hash{$c}->{$s})) {
536 my $s1_female_parent = $parent_hash->{$s}->{female_stock_id
};
537 my $s1_male_parent = $parent_hash->{$s}->{male_stock_id
};
538 my $s2_female_parent = $parent_hash->{$c}->{female_stock_id
};
539 my $s2_male_parent = $parent_hash->{$c}->{male_stock_id
};
541 if ($s1_female_parent == $s2_female_parent) {
544 if ($s1_male_parent == $s2_male_parent) {
547 $result_hash{$s}->{$c} = $rel/2;
548 $seen_stock_ids{$s}++;
549 $seen_stock_ids{$c}++;
554 foreach my $r (sort keys %result_hash) {
555 foreach my $s (sort keys %{$result_hash{$r}}) {
556 my $val = $result_hash{$r}->{$s};
557 $data .= "$r\t$s\t$val\n";
561 foreach my $a (@
$all_accession_stock_ids) {
562 if (!exists($seen_stock_ids{$a})) {
563 $data .= "$a\t$a\t1\n";
567 open(my $heatmap_fh, '>', $arm_tempfile) or die $!;
568 print $heatmap_fh $data;
571 my $arm_tempfile_out = $arm_tempfile . "_plot_out";
572 my $heatmap_cmd = 'R -e "library(ggplot2); library(data.table);
573 mat <- fread(\''.$arm_tempfile.'\', header=FALSE, sep=\'\t\', stringsAsFactors=FALSE);
574 pdf( \''.$arm_tempfile_out.'\', width = 8.5, height = 11);
575 ggplot(data = mat, aes(x=V1, y=V2, fill=V3)) + geom_tile();
578 print STDERR Dumper
$heatmap_cmd;
580 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_download_arm_heatmap";
581 mkdir $tmp_output_dir if ! -d
$tmp_output_dir;
583 # Do the GRM on the cluster
584 my $plot_cmd = CXGN
::Tools
::Run
->new(
586 backend
=> $backend_config,
587 submit_host
=> $cluster_host_config,
588 temp_base
=> $tmp_output_dir,
589 queue
=> $web_cluster_queue_config,
591 out_file
=> $arm_tempfile_out,
592 # don't block and wait if the cluster looks full
593 max_cluster_jobs
=> 1_000_000_000
,
597 $plot_cmd->run_cluster($heatmap_cmd);
598 $plot_cmd->is_cluster(1);
601 if ($return_type eq 'filehandle') {
602 open my $out_copy, '<', $arm_tempfile_out or die "Can't open output file: $!";
604 $self->cache()->set($key, '');
605 my $file_handle = $self->cache()->handle($key);
606 copy
($out_copy, $file_handle);
609 $return = $self->cache()->handle($key);
611 elsif ($return_type eq 'data') {
612 die "Can only return the filehandle for ARM heatmap!\n";