1 package CXGN
::Genotype
::GWAS
;
5 CXGN::Genotype::GWAS - an object to handle GWAS
9 my $geno = CXGN::Genotype::GWAS->new({
11 grm_temp_file=>$file_temp_path,
12 gwas_temp_file=>$file_temp_path_gwas,
13 pheno_temp_file=>$file_temp_path_pheno,
14 people_schema=>$people_schema,
15 download_format=>$download_format, #either 'results_tsv' or 'manhattan_qq_plots'
16 accession_id_list=>\@accession_list,
17 trait_id_list=>\@trait_id_list,
18 traits_are_repeated_measurements=>$traits_are_repeated_measurements,
19 protocol_id=>$protocol_id,
20 get_grm_for_parental_accessions=>1,
21 cache_root=>$cache_root,
22 minor_allele_frequency=>0.01,
24 individuals_filter=>0.8
26 $geno->download_gwas();
33 Nicolas Morales <nm529@cornell.edu>
42 use SGN
::Model
::Cvterm
;
45 use CXGN
::Stock
::Accession
;
46 use CXGN
::Genotype
::Protocol
;
47 use CXGN
::Genotype
::Search
;
48 use CXGN
::Genotype
::ComputeHybridGenotype
;
49 use CXGN
::Phenotypes
::SearchFactory
;
52 use R
::YapRI
::Data
::Matrix
;
53 use CXGN
::Dataset
::Cache
;
55 use Digest
::MD5 qw
| md5_hex
|;
56 use File
::Slurp qw
| write_file
|;
57 use File
::Temp
'tempfile';
62 isa
=> 'Bio::Chado::Schema',
67 has
'people_schema' => (
68 isa
=> 'CXGN::People::Schema',
73 # Uses a cached file system for getting genotype results and getting GRM
85 has
'cache_expiry' => (
88 default => 0, # never expires?
96 has
'grm_temp_file' => (
102 has
'gwas_temp_file' => (
108 has
'pheno_temp_file' => (
114 has
'protocol_id' => (
120 has
'download_format' => (
126 has
'minor_allele_frequency' => (
132 has
'marker_filter' => (
138 has
'individuals_filter' => (
144 has
'accession_id_list' => (
145 isa
=> 'ArrayRef[Int]|Undef',
149 has
'trait_id_list' => (
150 isa
=> 'ArrayRef[Int]|Undef',
154 has
'traits_are_repeated_measurements' => (
160 # If the accessions in the plots you are interested have not been genotyped (as in hybrids), can get this boolean to 1 and give a list of plot_id_list and you will get back a GRM built from the parent accessions for those plots (for the plots whose parents were genotyped)
161 has
'get_grm_for_parental_accessions' => (
167 has
'genotypeprop_hash_select' => (
168 isa
=> 'ArrayRef[Str]',
170 default => sub {['DS']} #THESE ARE THE GENERIC AND EXPECTED VCF ATRRIBUTES. For dosage matrix we only need DS
173 has
'protocolprop_top_key_select' => (
174 isa
=> 'ArrayRef[Str]',
176 default => sub {['markers']} #THESE ARE ALL POSSIBLE TOP LEVEL KEYS IN PROTOCOLPROP BASED ON VCF LOADING. For dosage matrix we only need markers
179 has
'protocolprop_marker_hash_select' => (
180 isa
=> 'ArrayRef[Str]',
182 default => sub {['name']} #THESE ARE ALL POSSIBLE PROTOCOLPROP MARKER HASH KEYS BASED ON VCF LOADING. For dosage matrix we only need name
185 has
'return_only_first_genotypeprop_for_stock' => (
193 my $shared_cluster_dir_config = shift;
194 my $backend_config = shift;
195 my $cluster_host_config = shift;
196 my $web_cluster_queue_config = shift;
197 my $basepath_config = shift;
198 my $schema = $self->bcs_schema();
199 my $people_schema = $self->people_schema();
200 my $cache_root_dir = $self->cache_root();
201 my $accession_list = $self->accession_id_list();
202 my $trait_list = $self->trait_id_list();
203 my $protocol_id = $self->protocol_id();
204 my $get_grm_for_parental_accessions = $self->get_grm_for_parental_accessions();
205 my $grm_tempfile = $self->grm_temp_file();
206 my $gwas_tempfile = $self->gwas_temp_file();
207 my $pheno_tempfile = $self->pheno_temp_file();
208 my $download_format = $self->download_format();
209 my $traits_are_repeated_measurements = $self->traits_are_repeated_measurements();
211 my $accession_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
212 my $plot_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'plot', 'stock_type')->cvterm_id();
213 my $plot_of_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'plot_of', 'stock_relationship')->cvterm_id();
214 my $female_parent_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'female_parent', 'stock_relationship')->cvterm_id();
215 my $male_parent_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'male_parent', 'stock_relationship')->cvterm_id();
217 my $number_system_cores = `getconf _NPROCESSORS_ONLN` or die "Could not get number of system cores!\n";
218 chomp($number_system_cores);
219 print STDERR
"NUMCORES $number_system_cores\n";
221 my $phenotypes_search = CXGN
::Phenotypes
::SearchFactory
->instantiate(
222 'MaterializedViewTable',
226 trait_list
=>$trait_list,
227 accession_list
=>$accession_list,
228 exclude_phenotype_outlier
=>0,
232 my ($data, $unique_traits) = $phenotypes_search->search();
233 my %unique_trait_ids;
234 my %unique_observation_units;
235 my %phenotype_data_hash;
236 my %filtered_accession_ids;
237 foreach my $d (@
$data) {
238 $unique_observation_units{$d->{observationunit_stock_id
}} = $d;
239 foreach my $o (@
{$d->{observations
}}) {
241 if ($traits_are_repeated_measurements) {
242 $trait_id_use = '0001';
245 $trait_id_use = $o->{trait_id
};
247 $unique_trait_ids{$trait_id_use}++;
248 if ($o->{value
} || $o->{value
} == 0) {
249 $phenotype_data_hash{$d->{observationunit_stock_id
}}->{$trait_id_use} = $o->{value
};
250 $filtered_accession_ids{$d->{germplasm_stock_id
}}++;
254 my @unique_stock_ids_sorted = sort keys %unique_observation_units;
255 my @unique_trait_ids_sorted = sort keys %unique_trait_ids;
256 my @unique_accession_ids_sorted = sort keys %filtered_accession_ids;
257 $accession_list = \
@unique_accession_ids_sorted;
259 my $trait_string_sql = join ',', @unique_trait_ids_sorted;
260 open(my $F_pheno, ">", $pheno_tempfile) || die "Can't open file ".$pheno_tempfile;
261 print $F_pheno 'gid,field_trial_id,replicate,'.$trait_string_sql."\n";
262 foreach my $stock_id (@unique_stock_ids_sorted) {
263 my $d = $unique_observation_units{$stock_id};
264 print $F_pheno $d->{germplasm_stock_id
}.','.$d->{trial_id
}.','.$d->{obsunit_rep
};
265 foreach my $t (@unique_trait_ids_sorted) {
266 my $pheno_val = $phenotype_data_hash{$stock_id}->{$t} || '';
267 print $F_pheno ','.$pheno_val;
273 my $protocol = CXGN
::Genotype
::Protocol
->new({
274 bcs_schema
=> $schema,
275 nd_protocol_id
=> $protocol_id
277 my $markers = $protocol->markers;
278 my @all_marker_objects = values %$markers;
279 if (scalar(@all_marker_objects) > 10000) {
280 my $page = CXGN
::Page
->new();
281 $page->message_page('GWAS Error', 'Please choose less than 10,000 markers');
282 print STDERR
"Error: for GWAS please choose less than 10,000 markers";
285 no warnings
'uninitialized';
286 @all_marker_objects = sort { $a->{chrom
} cmp $b->{chrom
} || $a->{pos} <=> $b->{pos} || $a->{name
} cmp $b->{name
} } @all_marker_objects;
288 my @individuals_stock_ids;
289 my @all_individual_accessions_stock_ids;
292 # In this case a list of accessions is given, so get a GRM between these accessions
293 if ($accession_list && scalar(@
$accession_list)>0 && !$get_grm_for_parental_accessions){
294 @all_individual_accessions_stock_ids = @
$accession_list;
295 if (scalar(@all_individual_accessions_stock_ids) > 100) {
296 my $page = CXGN
::Page
->new();
297 $page->message_page('GWAS Error', 'Please choose less than 100 accessions');
298 print STDERR
"Error: for GWAS please choose less than 100 accessions";
300 foreach (@
$accession_list) {
301 my $dataset = CXGN
::Dataset
::Cache
->new({
302 people_schema
=>$people_schema,
304 cache_root
=>$cache_root_dir,
307 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name','chrom','pos'], 1, [], undef, undef, []);
309 if (scalar(@
$genotypes)>0) {
311 # For old genotyping protocols without nd_protocolprop info...
312 if (scalar(@all_marker_objects) == 0) {
313 my $position_placeholder = 1;
314 foreach my $o (sort genosort
keys %{$genotypes->[0]->{selected_genotype_hash
}}) {
315 push @all_marker_objects, {name
=> $o, chrom
=> '1', pos => $position_placeholder};
316 $position_placeholder++;
320 foreach my $p (0..scalar(@
$genotypes)-1) {
321 my $geno = $genotypes->[$p];
323 my $genotype_string = "";
325 $genotype_string .= "ID\t";
326 foreach my $m (@all_marker_objects) {
327 $genotype_string .= $m->{name
} . "\t";
329 $genotype_string .= "\n";
330 $genotype_string .= "CHROM\t";
331 foreach my $m (@all_marker_objects) {
332 $genotype_string .= $geno->{selected_protocol_hash
}->{markers
}->{$m->{name
}}->{chrom
} . " \t";
334 $genotype_string .= "\n";
335 $genotype_string .= "POS\t";
336 foreach my $m (@all_marker_objects) {
337 $genotype_string .= $geno->{selected_protocol_hash
}->{markers
}->{$m->{name
}}->{pos} . " \t";
339 $genotype_string .= "\n";
341 my $genotype_id = $geno->{stock_id
};
342 my $genotype_data_string = "";
343 foreach my $m (@all_marker_objects) {
344 $genotype_data_string .= $geno->{selected_genotype_hash
}->{$m->{name
}}->{'DS'}."\t";
346 $genotype_string .= $genotype_id."\t".$genotype_data_string."\n";
348 push @individuals_stock_ids, $genotype_id;
349 write_file
($grm_tempfile, {append
=> 1}, $genotype_string);
350 undef $genotypes->[$p];
357 # IN this case of a hybrid evaluation where the parents of the accessions planted in a plot are genotyped
358 elsif ($get_grm_for_parental_accessions && $accession_list && scalar(@
$accession_list)>0) {
359 print STDERR
"COMPUTING GENOTYPE FROM PARENTS FOR ACCESSIONS\n";
360 my $accession_list_string = join ',', @
$accession_list;
361 my $q = "SELECT accession.stock_id, female_parent.stock_id, male_parent.stock_id
362 FROM stock AS accession
363 JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
364 JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
365 JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
366 JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
367 WHERE accession.type_id=$accession_cvterm_id AND accession.stock_id IN ($accession_list_string);";
368 my $h = $schema->storage->dbh()->prepare($q);
370 my @accession_stock_ids_found = ();
371 my @female_stock_ids_found = ();
372 my @male_stock_ids_found = ();
373 while (my ($accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
374 push @accession_stock_ids_found, $accession_stock_id;
375 push @female_stock_ids_found, $female_parent_stock_id;
376 push @male_stock_ids_found, $male_parent_stock_id;
379 # print STDERR Dumper \@accession_stock_ids_found;
380 # print STDERR Dumper \@female_stock_ids_found;
381 # print STDERR Dumper \@male_stock_ids_found;
383 @all_individual_accessions_stock_ids = @accession_stock_ids_found;
385 for my $i (0..scalar(@accession_stock_ids_found)-1) {
386 my $female_stock_id = $female_stock_ids_found[$i];
387 my $male_stock_id = $male_stock_ids_found[$i];
388 my $accession_stock_id = $accession_stock_ids_found[$i];
390 my $dataset = CXGN
::Dataset
::Cache
->new({
391 people_schema
=>$people_schema,
393 cache_root
=>$cache_root_dir,
394 accessions
=>[$female_stock_id, $male_stock_id]
396 my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name', 'chrom', 'pos'], 1, [], undef, undef, []);
398 if (scalar(@
$genotypes) > 0) {
399 # For old genotyping protocols without nd_protocolprop info...
400 if (scalar(@all_marker_objects) == 0) {
401 foreach my $o (sort genosort
keys %{$genotypes->[0]->{selected_genotype_hash
}}) {
402 push @all_marker_objects, {name
=> $o};
406 my $geno = $genotypes->[0];
408 my $genotype_string = "";
410 $genotype_string .= "ID\t";
411 foreach my $m (@all_marker_objects) {
412 $genotype_string .= $m->{name
} . "\t";
414 $genotype_string .= "\n";
415 $genotype_string .= "CHROM\t";
416 foreach my $m (@all_marker_objects) {
417 $genotype_string .= $geno->{selected_protocol_hash
}->{markers
}->{$m->{name
}}->{chrom
} . " \t";
419 $genotype_string .= "\n";
420 $genotype_string .= "POS\t";
421 foreach my $m (@all_marker_objects) {
422 $genotype_string .= $geno->{selected_protocol_hash
}->{markers
}->{$m->{name
}}->{pos} . " \t";
424 $genotype_string .= "\n";
426 my $geno_hybrid = CXGN
::Genotype
::ComputeHybridGenotype
->new({
427 parental_genotypes
=>$genotypes,
428 marker_objects
=>\
@all_marker_objects
430 my $progeny_genotype = $geno_hybrid->get_hybrid_genotype();
432 push @individuals_stock_ids, $accession_stock_id;
433 my $genotype_string_scores = join "\t", @
$progeny_genotype;
434 $genotype_string .= $accession_stock_id."\t".$genotype_string_scores."\n";
435 write_file
($grm_tempfile, {append
=> 1}, $genotype_string);
436 undef $progeny_genotype;
442 my $transpose_tempfile = $grm_tempfile . "_transpose";
444 my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_gwas";
445 mkdir $tmp_output_dir if ! -d
$tmp_output_dir;
447 my $transpose_cmd = CXGN
::Tools
::Run
->new(
449 backend
=> $backend_config,
450 submit_host
=> $cluster_host_config,
451 temp_base
=> $tmp_output_dir,
452 queue
=> $web_cluster_queue_config,
454 out_file
=> $transpose_tempfile,
455 # out_file => $transpose_tempfile,
456 # don't block and wait if the cluster looks full
457 max_cluster_jobs
=> 1_000_000_000
,
461 # Do the transposition job on the cluster
462 $transpose_cmd->run_cluster(
464 $basepath_config."/bin/transpose_matrix.pl",
467 $transpose_cmd->is_cluster(1);
468 $transpose_cmd->wait;
470 # print STDERR Dumper \@all_marker_names;
471 # print STDERR Dumper \@individuals_stock_ids;
472 # print STDERR Dumper \@dosage_matrix;
474 my $maf = $self->minor_allele_frequency();
475 my $marker_filter = $self->marker_filter();
476 my $individuals_filter = $self->individuals_filter();
478 my $cmd = 'R -e "library(genoDataFilter); library(rrBLUP); library(data.table); library(scales);
479 pheno <- fread(\''.$pheno_tempfile.'\', header=TRUE, sep=\',\');
480 pheno\$field_trial_id <- as.factor(pheno\$field_trial_id);
481 pheno\$replicate <- as.factor(pheno\$replicate);
482 geno_mat_marker_first <- fread(\''.$transpose_tempfile.'\', header=TRUE, sep=\'\t\') #has sample names as column names, first 3 columns are marker info;
483 geno_mat_sample_first <- data.frame(fread(\''.$grm_tempfile.'\', header=FALSE, sep=\'\t\', skip=3)) #has sample names in first column, no defined column names;
484 sample_names <- geno_mat_sample_first\$V1; #no defined column names but they are markers
485 geno_mat_sample_first <- geno_mat_sample_first[,-1]; #remove first column so that row names are sample names and all other columns are markers
486 geno_mat_sample_first <- as.data.frame(rescale(as.matrix(geno_mat_sample_first), to = c(-1,1) ) ); #rrBLUP expected -1 to 1
487 colnames(geno_mat_sample_first) <- geno_mat_marker_first\$ID; #has sample names as row names, column names are marker names
488 row.names(geno_mat_sample_first) <- sample_names;
489 mat_clean_sample_first <- filterGenoData(gData=geno_mat_sample_first, maf='.$maf.', markerFilter='.$marker_filter.', indFilter='.$individuals_filter.');
490 if (\'rn\' %in% colnames(mat_clean_sample_first)) { row.names(mat_clean_sample_first) <- mat_clean_sample_first\$rn; mat_clean_sample_first <- mat_clean_sample_first[,-1]; }
491 remaining_samples <- row.names(mat_clean_sample_first);
492 remaining_markers <- colnames(mat_clean_sample_first);
493 imputation <- A.mat(mat_clean_sample_first, impute.method=\'EM\', n.core='.$number_system_cores.', return.imputed=TRUE);
494 K.mat <- imputation\$A;
495 geno_imputed <- imputation\$imputed;
496 geno_gwas <- cbind(geno_mat_marker_first[geno_mat_marker_first\$ID %in% remaining_markers, c(1:3)], t(geno_imputed));
497 gwas_results <- GWAS(pheno[pheno\$gid %in% remaining_samples, ], geno_gwas, fixed=c(\'field_trial_id\',\'replicate\'), K=K.mat, plot=F, min.MAF='.$maf.'); #columns are ID,CHROM,POS,TraitIDs and values in TraitIDs column are -log10 p values'."\n";
498 if ($download_format eq 'manhattan_qq_plots') {
499 $cmd .= 'pdf( \''.$gwas_tempfile.'\', width = 11, height = 8.5 );
500 for (i in 4:length(gwas_results)) { alpha_bonferroni=-log10(0.05/length(gwas_results[,i])); chromosome_ids <- as.factor(gwas_results\$CHROM); marker_indicator <- match(unique(gwas_results\$CHROM), gwas_results\$CHROM); N <- length(gwas_results[,1]); plot(seq(1:N), gwas_results[,i], col=chromosome_ids, ylab=\'-log10(pvalue)\', main=paste(\'Manhattan Plot \',colnames(gwas_results)[i]), xaxt=\'n\', xlab=\'Position\', ylim=c(0,14)); axis(1,at=marker_indicator,labels=gwas_results\$CHROM[marker_indicator], cex.axis=0.8, las=2); abline(h=alpha_bonferroni,col=\'red\',lwd=2); expected.logvalues <- sort( -log10( c(1:N) * (1/N) ) ); observed.logvalues <- sort(gwas_results[,i]); plot(expected.logvalues, observed.logvalues, main=paste(\'QQ Plot \',colnames(gwas_results)[i]), xlab=\'Expected -log p-values \', ylab=\'Observed -log p-values\', col.main=\'black\', col=\'coral1\', pch=20); abline(0,1,lwd=3,col=\'black\'); }
504 elsif ($download_format eq 'results_tsv') {
505 $cmd .= 'write.table(gwas_results, file=\''.$gwas_tempfile.'\', row.names=FALSE, col.names=TRUE, sep=\'\t\');
508 print STDERR Dumper
$cmd;
510 # Do the GWAS on the cluster
511 my $gwas_cmd = CXGN
::Tools
::Run
->new(
513 backend
=> $backend_config,
514 submit_host
=> $cluster_host_config,
515 temp_base
=> $tmp_output_dir,
516 queue
=> $web_cluster_queue_config,
518 out_file
=> $gwas_tempfile,
519 # don't block and wait if the cluster looks full
520 max_cluster_jobs
=> 1_000_000_000
,
524 # Do the transposition job on the cluster
525 $gwas_cmd->run_cluster($cmd);
526 $gwas_cmd->is_cluster(1);
530 return ($gwas_tempfile, $status);
535 my $datatype = shift;
537 #print STDERR Dumper($self->_get_dataref());
538 my $json = JSON
->new();
539 #preserve order of hash keys to get same text
540 $json = $json->canonical();
541 my $accessions = $json->encode( $self->accession_id_list() || [] );
542 my $traits = $json->encode( $self->trait_id_list() || [] );
543 my $protocol = $self->protocol_id() || '';
544 my $genotypeprophash = $json->encode( $self->genotypeprop_hash_select() || [] );
545 my $protocolprophash = $json->encode( $self->protocolprop_top_key_select() || [] );
546 my $protocolpropmarkerhash = $json->encode( $self->protocolprop_marker_hash_select() || [] );
547 my $maf = $self->minor_allele_frequency();
548 my $marker_filter = $self->marker_filter();
549 my $download_format = $self->download_format();
550 my $individuals_filter = $self->individuals_filter();
551 my $key = md5_hex
($accessions.$traits.$protocol.$genotypeprophash.$protocolprophash.$protocolpropmarkerhash.$self->get_grm_for_parental_accessions().$self->return_only_first_genotypeprop_for_stock()."_MAF$maf"."_mfilter$marker_filter"."_ifilter$individuals_filter"."repeated".$self->traits_are_repeated_measurements()."format$download_format"."_$datatype");
557 my $shared_cluster_dir_config = shift;
558 my $backend_config = shift;
559 my $cluster_host_config = shift;
560 my $web_cluster_queue_config = shift;
561 my $basepath_config = shift;
563 my $key = $self->grm_cache_key("download_gwas_v01");
564 $self->_cache_key($key);
565 $self->cache( Cache
::File
->new( cache_root
=> $self->cache_root() ));
568 if ($self->cache()->exists($key)) {
569 $return = $self->cache()->handle($key);
572 my ($gwas_tempfile, $status) = $self->get_gwas($shared_cluster_dir_config, $backend_config, $cluster_host_config, $web_cluster_queue_config, $basepath_config);
574 open my $out_copy, '<', $gwas_tempfile or die "Can't open output file: $!";
576 $self->cache()->set($key, '');
577 my $file_handle = $self->cache()->handle($key);
578 copy
($out_copy, $file_handle);
581 $return = $self->cache()->handle($key);
587 my ($a_chr, $a_pos, $b_chr, $b_pos);
588 if ($a =~ m/S(\d+)\_(.*)/) {
592 if ($b =~ m/S(\d+)\_(.*)/) {
597 if ($a_chr && $b_chr) {
598 if ($a_chr == $b_chr) {
599 return $a_pos <=> $b_pos;
601 return $a_chr <=> $b_chr;