lib/CXGN/Genotype/GWAS.pm

   1 package CXGN::Genotype::GWAS;
   2
   3 =head1 NAME
   4
   5 CXGN::Genotype::GWAS - an object to handle GWAS
   6
   7 =head1 USAGE
   8
   9 my $geno = CXGN::Genotype::GWAS->new({
  10     bcs_schema=>$schema,
  11     grm_temp_file=>$file_temp_path,
  12     gwas_temp_file=>$file_temp_path_gwas,
  13     pheno_temp_file=>$file_temp_path_pheno,
  14     people_schema=>$people_schema,
  15     download_format=>$download_format, #either 'results_tsv' or 'manhattan_qq_plots'
  16     accession_id_list=>\@accession_list,
  17     trait_id_list=>\@trait_id_list,
  18     traits_are_repeated_measurements=>$traits_are_repeated_measurements,
  19     protocol_id=>$protocol_id,
  20     get_grm_for_parental_accessions=>1,
  21     cache_root=>$cache_root,
  22     minor_allele_frequency=>0.01,
  23     marker_filter=>0.6,
  24     individuals_filter=>0.8
  25 });
  26 $geno->download_gwas();
  27
  28 =head1 DESCRIPTION
  29
  30
  31 =head1 AUTHORS
  32
  33  Nicolas Morales <nm529@cornell.edu>
  34
  35 =cut
  36
  37 use strict;
  38 use warnings;
  39 use Moose;
  40 use Try::Tiny;
  41 use Data::Dumper;
  42 use SGN::Model::Cvterm;
  43 use CXGN::Trial;
  44 use JSON;
  45 use CXGN::Stock::Accession;
  46 use CXGN::Genotype::Protocol;
  47 use CXGN::Genotype::Search;
  48 use CXGN::Genotype::ComputeHybridGenotype;
  49 use CXGN::Phenotypes::SearchFactory;
  50 use CXGN::Page;
  51 use R::YapRI::Base;
  52 use R::YapRI::Data::Matrix;
  53 use CXGN::Dataset::Cache;
  54 use Cache::File;
  55 use Digest::MD5 qw | md5_hex |;
  56 use File::Slurp qw | write_file |;
  57 use File::Temp 'tempfile';
  58 use File::Copy;
  59 use POSIX;
  60
  61 has 'bcs_schema' => (
  62     isa => 'Bio::Chado::Schema',
  63     is => 'rw',
  64     required => 1
  65 );
  66
  67 has 'people_schema' => (
  68     isa => 'CXGN::People::Schema',
  69     is => 'rw',
  70     required => 1
  71 );
  72
  73 # Uses a cached file system for getting genotype results and getting GRM
  74 has 'cache_root' => (
  75     isa => 'Str',
  76     is => 'rw',
  77     required => 1
  78 );
  79
  80 has 'cache' => (
  81     isa => 'Cache::File',
  82     is => 'rw',
  83 );
  84
  85 has 'cache_expiry' => (
  86     isa => 'Int',
  87     is => 'rw',
  88     default => 0, # never expires?
  89 );
  90
  91 has '_cache_key' => (
  92     isa => 'Str',
  93     is => 'rw',
  94 );
  95
  96 has 'grm_temp_file' => (
  97     isa => 'Str',
  98     is => 'rw',
  99     required => 1
 100 );
 101
 102 has 'gwas_temp_file' => (
 103     isa => 'Str',
 104     is => 'rw',
 105     required => 1
 106 );
 107
 108 has 'pheno_temp_file' => (
 109     isa => 'Str',
 110     is => 'rw',
 111     required => 1
 112 );
 113
 114 has 'protocol_id' => (
 115     isa => 'Int',
 116     is => 'rw',
 117     required => 1
 118 );
 119
 120 has 'download_format' => (
 121     isa => 'Str',
 122     is => 'rw',
 123     required => 1
 124 );
 125
 126 has 'minor_allele_frequency' => (
 127     isa => 'Num',
 128     is => 'rw',
 129     default => sub{0.05}
 130 );
 131
 132 has 'marker_filter' => (
 133     isa => 'Num',
 134     is => 'rw',
 135     default => sub{0.60}
 136 );
 137
 138 has 'individuals_filter' => (
 139     isa => 'Num',
 140     is => 'rw',
 141     default => sub{0.80}
 142 );
 143
 144 has 'accession_id_list' => (
 145     isa => 'ArrayRef[Int]|Undef',
 146     is => 'rw'
 147 );
 148
 149 has 'trait_id_list' => (
 150     isa => 'ArrayRef[Int]|Undef',
 151     is => 'rw'
 152 );
 153
 154 has 'traits_are_repeated_measurements' => (
 155     isa => 'Bool',
 156     is => 'ro',
 157     default => 0
 158 );
 159
 160 # If the accessions in the plots you are interested have not been genotyped (as in hybrids), can get this boolean to 1 and give a list of plot_id_list and you will get back a GRM built from the parent accessions for those plots (for the plots whose parents were genotyped)
 161 has 'get_grm_for_parental_accessions' => (
 162     isa => 'Bool',
 163     is => 'ro',
 164     default => 0
 165 );
 166
 167 has 'genotypeprop_hash_select' => (
 168     isa => 'ArrayRef[Str]',
 169     is => 'ro',
 170     default => sub {['DS']} #THESE ARE THE GENERIC AND EXPECTED VCF ATRRIBUTES. For dosage matrix we only need DS
 171 );
 172
 173 has 'protocolprop_top_key_select' => (
 174     isa => 'ArrayRef[Str]',
 175     is => 'ro',
 176     default => sub {['markers']} #THESE ARE ALL POSSIBLE TOP LEVEL KEYS IN PROTOCOLPROP BASED ON VCF LOADING. For dosage matrix we only need markers
 177 );
 178
 179 has 'protocolprop_marker_hash_select' => (
 180     isa => 'ArrayRef[Str]',
 181     is => 'ro',
 182     default => sub {['name']} #THESE ARE ALL POSSIBLE PROTOCOLPROP MARKER HASH KEYS BASED ON VCF LOADING. For dosage matrix we only need name
 183 );
 184
 185 has 'return_only_first_genotypeprop_for_stock' => (
 186     isa => 'Bool',
 187     is => 'ro',
 188     default => 1
 189 );
 190
 191 sub get_gwas {
 192     my $self = shift;
 193     my $shared_cluster_dir_config = shift;
 194     my $backend_config = shift;
 195     my $cluster_host_config = shift;
 196     my $web_cluster_queue_config = shift;
 197     my $basepath_config = shift;
 198     my $schema = $self->bcs_schema();
 199     my $people_schema = $self->people_schema();
 200     my $cache_root_dir = $self->cache_root();
 201     my $accession_list = $self->accession_id_list();
 202     my $trait_list = $self->trait_id_list();
 203     my $protocol_id = $self->protocol_id();
 204     my $get_grm_for_parental_accessions = $self->get_grm_for_parental_accessions();
 205     my $grm_tempfile = $self->grm_temp_file();
 206     my $gwas_tempfile = $self->gwas_temp_file();
 207     my $pheno_tempfile = $self->pheno_temp_file();
 208     my $download_format = $self->download_format();
 209     my $traits_are_repeated_measurements = $self->traits_are_repeated_measurements();
 210
 211     my $accession_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
 212     my $plot_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'plot', 'stock_type')->cvterm_id();
 213     my $plot_of_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'plot_of', 'stock_relationship')->cvterm_id();
 214     my $female_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'female_parent', 'stock_relationship')->cvterm_id();
 215     my $male_parent_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'male_parent', 'stock_relationship')->cvterm_id();
 216
 217     my $number_system_cores = `getconf _NPROCESSORS_ONLN` or die "Could not get number of system cores!\n";
 218     chomp($number_system_cores);
 219     print STDERR "NUMCORES $number_system_cores\n";
 220
 221     my $phenotypes_search = CXGN::Phenotypes::SearchFactory->instantiate(
 222         'MaterializedViewTable',
 223         {
 224             bcs_schema=>$schema,
 225             data_level=>'plot',
 226             trait_list=>$trait_list,
 227             accession_list=>$accession_list,
 228             exclude_phenotype_outlier=>0,
 229             include_timestamp=>0
 230         }
 231     );
 232     my ($data, $unique_traits) = $phenotypes_search->search();
 233     my %unique_trait_ids;
 234     my %unique_observation_units;
 235     my %phenotype_data_hash;
 236     my %filtered_accession_ids;
 237     foreach my $d (@$data) {
 238         $unique_observation_units{$d->{observationunit_stock_id}} = $d;
 239         foreach my $o (@{$d->{observations}}) {
 240             my $trait_id_use;
 241             if ($traits_are_repeated_measurements) {
 242                 $trait_id_use = '0001';
 243             }
 244             else {
 245                 $trait_id_use = $o->{trait_id};
 246             }
 247             $unique_trait_ids{$trait_id_use}++;
 248             if ($o->{value} || $o->{value} == 0) {
 249                 $phenotype_data_hash{$d->{observationunit_stock_id}}->{$trait_id_use} = $o->{value};
 250                 $filtered_accession_ids{$d->{germplasm_stock_id}}++;
 251             }
 252         }
 253     }
 254     my @unique_stock_ids_sorted = sort keys %unique_observation_units;
 255     my @unique_trait_ids_sorted = sort keys %unique_trait_ids;
 256     my @unique_accession_ids_sorted = sort keys %filtered_accession_ids;
 257     $accession_list = \@unique_accession_ids_sorted;
 258
 259     my $trait_string_sql = join ',', @unique_trait_ids_sorted;
 260     open(my $F_pheno, ">", $pheno_tempfile) || die "Can't open file ".$pheno_tempfile;
 261         print $F_pheno 'gid,field_trial_id,replicate,'.$trait_string_sql."\n";
 262         foreach my $stock_id (@unique_stock_ids_sorted) {
 263             my $d = $unique_observation_units{$stock_id};
 264             print $F_pheno $d->{germplasm_stock_id}.','.$d->{trial_id}.','.$d->{obsunit_rep};
 265             foreach my $t (@unique_trait_ids_sorted) {
 266                 my $pheno_val = $phenotype_data_hash{$stock_id}->{$t} || '';
 267                 print $F_pheno ','.$pheno_val;
 268             }
 269             print $F_pheno "\n";
 270         }
 271     close($F_pheno);
 272
 273     my $protocol = CXGN::Genotype::Protocol->new({
 274         bcs_schema => $schema,
 275         nd_protocol_id => $protocol_id
 276     });
 277     my $markers = $protocol->markers;
 278     my @all_marker_objects = values %$markers;
 279     if (scalar(@all_marker_objects) > 10000) {
 280         my $page = CXGN::Page->new();
 281         $page->message_page('GWAS Error', 'Please choose less than 10,000 markers');
 282         print STDERR "Error: for GWAS please choose less than 10,000 markers";
 283     }
 284
 285     no warnings 'uninitialized';
 286     @all_marker_objects = sort { $a->{chrom} cmp $b->{chrom} || $a->{pos} <=> $b->{pos} || $a->{name} cmp $b->{name} } @all_marker_objects;
 287
 288     my @individuals_stock_ids;
 289     my @all_individual_accessions_stock_ids;
 290     my $counter = 0;
 291
 292     # In this case a list of accessions is given, so get a GRM between these accessions
 293     if ($accession_list && scalar(@$accession_list)>0 && !$get_grm_for_parental_accessions){
 294         @all_individual_accessions_stock_ids = @$accession_list;
 295         if (scalar(@all_individual_accessions_stock_ids) > 100) {
 296             my $page = CXGN::Page->new();
 297             $page->message_page('GWAS Error', 'Please choose less than 100 accessions');
 298             print STDERR "Error: for GWAS please choose less than 100 accessions";
 299         }
 300         foreach (@$accession_list) {
 301             my $dataset = CXGN::Dataset::Cache->new({
 302                 people_schema=>$people_schema,
 303                 schema=>$schema,
 304                 cache_root=>$cache_root_dir,
 305                 accessions=>[$_]
 306             });
 307             my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name','chrom','pos'], 1, [], undef, undef, []);
 308
 309             if (scalar(@$genotypes)>0) {
 310
 311                 # For old genotyping protocols without nd_protocolprop info...
 312                 if (scalar(@all_marker_objects) == 0) {
 313                     my $position_placeholder = 1;
 314                     foreach my $o (sort genosort keys %{$genotypes->[0]->{selected_genotype_hash}}) {
 315                         push @all_marker_objects, {name => $o, chrom => '1', pos => $position_placeholder};
 316                         $position_placeholder++;
 317                     }
 318                 }
 319
 320                 foreach my $p (0..scalar(@$genotypes)-1) {
 321                     my $geno = $genotypes->[$p];
 322
 323                     my $genotype_string = "";
 324                     if ($counter == 0) {
 325                         $genotype_string .= "ID\t";
 326                         foreach my $m (@all_marker_objects) {
 327                             $genotype_string .= $m->{name} . "\t";
 328                         }
 329                         $genotype_string .= "\n";
 330                         $genotype_string .= "CHROM\t";
 331                         foreach my $m (@all_marker_objects) {
 332                             $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom} . " \t";
 333                         }
 334                         $genotype_string .= "\n";
 335                         $genotype_string .= "POS\t";
 336                         foreach my $m (@all_marker_objects) {
 337                             $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos} . " \t";
 338                         }
 339                         $genotype_string .= "\n";
 340                     }
 341                     my $genotype_id = $geno->{stock_id};
 342                     my $genotype_data_string = "";
 343                     foreach my $m (@all_marker_objects) {
 344                         $genotype_data_string .= $geno->{selected_genotype_hash}->{$m->{name}}->{'DS'}."\t";
 345                     }
 346                     $genotype_string .= $genotype_id."\t".$genotype_data_string."\n";
 347
 348                     push @individuals_stock_ids, $genotype_id;
 349                     write_file($grm_tempfile, {append => 1}, $genotype_string);
 350                     undef $genotypes->[$p];
 351                     $counter++;
 352                 }
 353                 undef $genotypes;
 354             }
 355         }
 356     }
 357     # IN this case of a hybrid evaluation where the parents of the accessions planted in a plot are genotyped
 358     elsif ($get_grm_for_parental_accessions && $accession_list && scalar(@$accession_list)>0) {
 359         print STDERR "COMPUTING GENOTYPE FROM PARENTS FOR ACCESSIONS\n";
 360         my $accession_list_string = join ',', @$accession_list;
 361         my $q = "SELECT accession.stock_id, female_parent.stock_id, male_parent.stock_id
 362             FROM stock AS accession
 363             JOIN stock_relationship AS female_parent_rel ON(accession.stock_id=female_parent_rel.object_id AND female_parent_rel.type_id=$female_parent_cvterm_id)
 364             JOIN stock AS female_parent ON(female_parent_rel.subject_id = female_parent.stock_id AND female_parent.type_id=$accession_cvterm_id)
 365             JOIN stock_relationship AS male_parent_rel ON(accession.stock_id=male_parent_rel.object_id AND male_parent_rel.type_id=$male_parent_cvterm_id)
 366             JOIN stock AS male_parent ON(male_parent_rel.subject_id = male_parent.stock_id AND male_parent.type_id=$accession_cvterm_id)
 367             WHERE accession.type_id=$accession_cvterm_id AND accession.stock_id IN ($accession_list_string);";
 368         my $h = $schema->storage->dbh()->prepare($q);
 369         $h->execute();
 370         my @accession_stock_ids_found = ();
 371         my @female_stock_ids_found = ();
 372         my @male_stock_ids_found = ();
 373         while (my ($accession_stock_id, $female_parent_stock_id, $male_parent_stock_id) = $h->fetchrow_array()) {
 374             push @accession_stock_ids_found, $accession_stock_id;
 375             push @female_stock_ids_found, $female_parent_stock_id;
 376             push @male_stock_ids_found, $male_parent_stock_id;
 377         }
 378
 379         # print STDERR Dumper \@accession_stock_ids_found;
 380         # print STDERR Dumper \@female_stock_ids_found;
 381         # print STDERR Dumper \@male_stock_ids_found;
 382
 383         @all_individual_accessions_stock_ids = @accession_stock_ids_found;
 384
 385         for my $i (0..scalar(@accession_stock_ids_found)-1) {
 386             my $female_stock_id = $female_stock_ids_found[$i];
 387             my $male_stock_id = $male_stock_ids_found[$i];
 388             my $accession_stock_id = $accession_stock_ids_found[$i];
 389
 390             my $dataset = CXGN::Dataset::Cache->new({
 391                 people_schema=>$people_schema,
 392                 schema=>$schema,
 393                 cache_root=>$cache_root_dir,
 394                 accessions=>[$female_stock_id, $male_stock_id]
 395             });
 396             my $genotypes = $dataset->retrieve_genotypes($protocol_id, ['DS'], ['markers'], ['name', 'chrom', 'pos'], 1, [], undef, undef, []);
 397
 398             if (scalar(@$genotypes) > 0) {
 399                 # For old genotyping protocols without nd_protocolprop info...
 400                 if (scalar(@all_marker_objects) == 0) {
 401                     foreach my $o (sort genosort keys %{$genotypes->[0]->{selected_genotype_hash}}) {
 402                         push @all_marker_objects, {name => $o};
 403                     }
 404                 }
 405
 406                 my $geno = $genotypes->[0];
 407
 408                 my $genotype_string = "";
 409                 if ($counter == 0) {
 410                     $genotype_string .= "ID\t";
 411                     foreach my $m (@all_marker_objects) {
 412                         $genotype_string .= $m->{name} . "\t";
 413                     }
 414                     $genotype_string .= "\n";
 415                     $genotype_string .= "CHROM\t";
 416                     foreach my $m (@all_marker_objects) {
 417                         $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom} . " \t";
 418                     }
 419                     $genotype_string .= "\n";
 420                     $genotype_string .= "POS\t";
 421                     foreach my $m (@all_marker_objects) {
 422                         $genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos} . " \t";
 423                     }
 424                     $genotype_string .= "\n";
 425                 }
 426                 my $geno_hybrid = CXGN::Genotype::ComputeHybridGenotype->new({
 427                     parental_genotypes=>$genotypes,
 428                     marker_objects=>\@all_marker_objects
 429                 });
 430                 my $progeny_genotype = $geno_hybrid->get_hybrid_genotype();
 431
 432                 push @individuals_stock_ids, $accession_stock_id;
 433                 my $genotype_string_scores = join "\t", @$progeny_genotype;
 434                 $genotype_string .= $accession_stock_id."\t".$genotype_string_scores."\n";
 435                 write_file($grm_tempfile, {append => 1}, $genotype_string);
 436                 undef $progeny_genotype;
 437                 $counter++;
 438             }
 439         }
 440     }
 441
 442     my $transpose_tempfile = $grm_tempfile . "_transpose";
 443
 444     my $tmp_output_dir = $shared_cluster_dir_config."/tmp_genotype_download_gwas";
 445     mkdir $tmp_output_dir if ! -d $tmp_output_dir;
 446
 447     my $transpose_cmd = CXGN::Tools::Run->new(
 448         {
 449             backend => $backend_config,
 450             submit_host => $cluster_host_config,
 451             temp_base => $tmp_output_dir,
 452             queue => $web_cluster_queue_config,
 453             do_cleanup => 0,
 454             out_file => $transpose_tempfile,
 455 #            out_file => $transpose_tempfile,
 456             # don't block and wait if the cluster looks full
 457             max_cluster_jobs => 1_000_000_000,
 458         }
 459     );
 460
 461     # Do the transposition job on the cluster
 462     $transpose_cmd->run_cluster(
 463             "perl ",
 464             $basepath_config."/bin/transpose_matrix.pl",
 465             $grm_tempfile,
 466     );
 467     $transpose_cmd->is_cluster(1);
 468     $transpose_cmd->wait;
 469
 470     # print STDERR Dumper \@all_marker_names;
 471     # print STDERR Dumper \@individuals_stock_ids;
 472     # print STDERR Dumper \@dosage_matrix;
 473
 474     my $maf = $self->minor_allele_frequency();
 475     my $marker_filter = $self->marker_filter();
 476     my $individuals_filter = $self->individuals_filter();
 477
 478     my $cmd = 'R -e "library(genoDataFilter); library(rrBLUP); library(data.table); library(scales);
 479     pheno <- fread(\''.$pheno_tempfile.'\', header=TRUE, sep=\',\');
 480     pheno\$field_trial_id <- as.factor(pheno\$field_trial_id);
 481     pheno\$replicate <- as.factor(pheno\$replicate);
 482     geno_mat_marker_first <- fread(\''.$transpose_tempfile.'\', header=TRUE, sep=\'\t\') #has sample names as column names, first 3 columns are marker info;
 483     geno_mat_sample_first <- data.frame(fread(\''.$grm_tempfile.'\', header=FALSE, sep=\'\t\', skip=3)) #has sample names in first column, no defined column names;
 484     sample_names <- geno_mat_sample_first\$V1; #no defined column names but they are markers
 485     geno_mat_sample_first <- geno_mat_sample_first[,-1]; #remove first column so that row names are sample names and all other columns are markers
 486     geno_mat_sample_first <- as.data.frame(rescale(as.matrix(geno_mat_sample_first), to = c(-1,1) ) ); #rrBLUP expected -1 to 1
 487     colnames(geno_mat_sample_first) <- geno_mat_marker_first\$ID; #has sample names as row names, column names are marker names
 488     row.names(geno_mat_sample_first) <- sample_names;
 489     mat_clean_sample_first <- filterGenoData(gData=geno_mat_sample_first, maf='.$maf.', markerFilter='.$marker_filter.', indFilter='.$individuals_filter.');
 490     if (\'rn\' %in% colnames(mat_clean_sample_first)) { row.names(mat_clean_sample_first) <- mat_clean_sample_first\$rn; mat_clean_sample_first <- mat_clean_sample_first[,-1]; }
 491     remaining_samples <- row.names(mat_clean_sample_first);
 492     remaining_markers <- colnames(mat_clean_sample_first);
 493     imputation <- A.mat(mat_clean_sample_first, impute.method=\'EM\', n.core='.$number_system_cores.', return.imputed=TRUE);
 494     K.mat <- imputation\$A;
 495     geno_imputed <- imputation\$imputed;
 496     geno_gwas <- cbind(geno_mat_marker_first[geno_mat_marker_first\$ID %in% remaining_markers, c(1:3)], t(geno_imputed));
 497     gwas_results <- GWAS(pheno[pheno\$gid %in% remaining_samples, ], geno_gwas, fixed=c(\'field_trial_id\',\'replicate\'), K=K.mat, plot=F, min.MAF='.$maf.'); #columns are ID,CHROM,POS,TraitIDs and values in TraitIDs column are -log10 p values'."\n";
 498     if ($download_format eq 'manhattan_qq_plots') {
 499         $cmd .= 'pdf( \''.$gwas_tempfile.'\', width = 11, height = 8.5 );
 500         for (i in 4:length(gwas_results)) { alpha_bonferroni=-log10(0.05/length(gwas_results[,i])); chromosome_ids <- as.factor(gwas_results\$CHROM); marker_indicator <- match(unique(gwas_results\$CHROM), gwas_results\$CHROM); N <- length(gwas_results[,1]); plot(seq(1:N), gwas_results[,i], col=chromosome_ids, ylab=\'-log10(pvalue)\', main=paste(\'Manhattan Plot \',colnames(gwas_results)[i]), xaxt=\'n\', xlab=\'Position\', ylim=c(0,14)); axis(1,at=marker_indicator,labels=gwas_results\$CHROM[marker_indicator], cex.axis=0.8, las=2); abline(h=alpha_bonferroni,col=\'red\',lwd=2); expected.logvalues <- sort( -log10( c(1:N) * (1/N) ) ); observed.logvalues <- sort(gwas_results[,i]); plot(expected.logvalues, observed.logvalues, main=paste(\'QQ Plot \',colnames(gwas_results)[i]), xlab=\'Expected -log p-values \', ylab=\'Observed -log p-values\', col.main=\'black\', col=\'coral1\', pch=20); abline(0,1,lwd=3,col=\'black\'); }
 501         dev.off();
 502         "';
 503     }
 504     elsif ($download_format eq 'results_tsv') {
 505         $cmd .= 'write.table(gwas_results, file=\''.$gwas_tempfile.'\', row.names=FALSE, col.names=TRUE, sep=\'\t\');
 506         "';
 507     }
 508     print STDERR Dumper $cmd;
 509
 510     # Do the GWAS on the cluster
 511     my $gwas_cmd = CXGN::Tools::Run->new(
 512         {
 513             backend => $backend_config,
 514             submit_host => $cluster_host_config,
 515             temp_base => $tmp_output_dir,
 516             queue => $web_cluster_queue_config,
 517             do_cleanup => 0,
 518             out_file => $gwas_tempfile,
 519             # don't block and wait if the cluster looks full
 520             max_cluster_jobs => 1_000_000_000,
 521         }
 522     );
 523
 524     # Do the transposition job on the cluster
 525     $gwas_cmd->run_cluster($cmd);
 526     $gwas_cmd->is_cluster(1);
 527     $gwas_cmd->wait;
 528     my $status;
 529
 530     return ($gwas_tempfile, $status);
 531 }
 532
 533 sub grm_cache_key {
 534     my $self = shift;
 535     my $datatype = shift;
 536
 537     #print STDERR Dumper($self->_get_dataref());
 538     my $json = JSON->new();
 539     #preserve order of hash keys to get same text
 540     $json = $json->canonical();
 541     my $accessions = $json->encode( $self->accession_id_list() || [] );
 542     my $traits = $json->encode( $self->trait_id_list() || [] );
 543     my $protocol = $self->protocol_id() || '';
 544     my $genotypeprophash = $json->encode( $self->genotypeprop_hash_select() || [] );
 545     my $protocolprophash = $json->encode( $self->protocolprop_top_key_select() || [] );
 546     my $protocolpropmarkerhash = $json->encode( $self->protocolprop_marker_hash_select() || [] );
 547     my $maf = $self->minor_allele_frequency();
 548     my $marker_filter = $self->marker_filter();
 549     my $download_format = $self->download_format();
 550     my $individuals_filter = $self->individuals_filter();
 551     my $key = md5_hex($accessions.$traits.$protocol.$genotypeprophash.$protocolprophash.$protocolpropmarkerhash.$self->get_grm_for_parental_accessions().$self->return_only_first_genotypeprop_for_stock()."_MAF$maf"."_mfilter$marker_filter"."_ifilter$individuals_filter"."repeated".$self->traits_are_repeated_measurements()."format$download_format"."_$datatype");
 552     return $key;
 553 }
 554
 555 sub download_gwas {
 556     my $self = shift;
 557     my $shared_cluster_dir_config = shift;
 558     my $backend_config = shift;
 559     my $cluster_host_config = shift;
 560     my $web_cluster_queue_config = shift;
 561     my $basepath_config = shift;
 562
 563     my $key = $self->grm_cache_key("download_gwas_v01");
 564     $self->_cache_key($key);
 565     $self->cache( Cache::File->new( cache_root => $self->cache_root() ));
 566
 567     my $return;
 568     if ($self->cache()->exists($key)) {
 569         $return = $self->cache()->handle($key);
 570     }
 571     else {
 572         my ($gwas_tempfile, $status) = $self->get_gwas($shared_cluster_dir_config, $backend_config, $cluster_host_config, $web_cluster_queue_config, $basepath_config);
 573
 574         open my $out_copy, '<', $gwas_tempfile or die "Can't open output file: $!";
 575
 576         $self->cache()->set($key, '');
 577         my $file_handle = $self->cache()->handle($key);
 578         copy($out_copy, $file_handle);
 579
 580         close $out_copy;
 581         $return = $self->cache()->handle($key);
 582     }
 583     return $return;
 584 }
 585
 586 sub genosort {
 587     my ($a_chr, $a_pos, $b_chr, $b_pos);
 588     if ($a =~ m/S(\d+)\_(.*)/) {
 589         $a_chr = $1;
 590         $a_pos = $2;
 591     }
 592     if ($b =~ m/S(\d+)\_(.*)/) {
 593         $b_chr = $1;
 594         $b_pos = $2;
 595     }
 596
 597     if ($a_chr && $b_chr) {
 598         if ($a_chr == $b_chr) {
 599             return $a_pos <=> $b_pos;
 600         }
 601         return $a_chr <=> $b_chr;
 602     } else {
 603         return -1;
 604     }
 605 }
 606
 607 1;