Merge pull request #5163 from solgenomics/audit-error-checking
[sgn.git] / lib / CXGN / MixedModels.pm
blobd3db774dfeb819015f28b6f0981ea73f65bf829c
2 =head1 NAME
4 CXGN::MixedModels - a package to run user-specified mixed models
6 =head1 DESCRIPTION
8 my $mm = CXGN::MixedModels->new();
9 my $mm->phenotype_file("t/data/phenotype_data.csv");
10 my $mm->dependent_variables(qw | |);
11 my $mm->fixed_factors( qw | | );
12 my $mm->random_factors( qw| | );
13 my $mm->traits( qw| | );
15 =head1 AUTHOR
17 Lukas Mueller <lam87@cornell.edu>
19 =head1 METHODS
21 =cut
23 package CXGN::MixedModels;
25 use Moose;
26 use Data::Dumper;
27 use File::Basename;
28 use File::Copy;
29 use CXGN::Tools::Run;
30 use CXGN::Phenotypes::File;
32 =head2 dependent_variables()
34 sets the dependent variables (a listref of traits)
36 =cut
38 has 'dependent_variables' => (is => 'rw', isa => 'ArrayRef[Str]|Undef');
40 =head2 fixed_factors()
42 sets the fixed factors (listref)
44 =cut
46 has 'fixed_factors' => (is => 'rw', isa => 'Ref', default => sub {[]});
48 =head2 fixed_factors_interaction()
50 sets the fixed factors with interaction (listref)
52 =cut
54 has 'fixed_factors_interaction' => (is => 'rw', isa => 'Ref', default => sub{[]});
56 =head2 random_factors_interaction()
58 sets the random factors with interaction (listref)
60 =cut
62 has 'random_factors_interaction' => (is => 'rw', isa => 'Ref', default => sub{[]});
64 =head2 variable_slope_factors()
66 =cut
68 has 'variable_slope_factors' => (is => 'rw', isa => 'Ref', default => sub{[]});
70 =head2 random_factors()
72 =cut
74 has 'random_factors' => (is => 'rw', isa => 'Ref', default => sub {[]});
76 =head2 variable_slop_intersects
78 =cut
80 has 'variable_slope_intersects' => (is => 'rw', isa => 'Ref', default => sub {[]});
82 =head2 traits()
84 sets the traits
86 =cut
88 has 'traits' => (is => 'rw', isa => 'Ref');
90 =head2 levels()
93 =head2 engine()
95 sets the engine. Either sommer or lme4. Default: lme4.
97 =cut
99 has 'engine' => (is => 'rw', isa => 'Maybe[Str]', default => 'lme4' );
101 has 'levels' => (is => 'rw', isa => 'HashRef' );
103 has 'phenotype_file' => (is => 'rw', isa => 'CXGN::Phenotypes::File|Undef');
105 =head2 tempfile()
107 the tempfile that contains the phenotypic information.
109 =cut
111 has 'tempfile' => (is => 'rw', isa => 'Str|Undef');
113 sub BUILD {
114 my $self = shift;
116 my $phenotype_file;
118 if ($self->tempfile()) {
119 $phenotype_file = CXGN::Phenotypes::File->new( { file => $self->tempfile() } );
121 $self->phenotype_file($phenotype_file);
125 =head2 generate_model()
127 generates the model string, in lme4 format, from the current parameters
129 =cut
131 sub generate_model {
132 my $self = shift;
134 my $tempfile = $self->tempfile();
135 my $dependent_variables = $self->dependent_variables();
136 my $fixed_factors = $self->fixed_factors();
137 my $fixed_factors_interaction = $self->fixed_factors_interaction();
138 my $variable_slope_intersects = $self->variable_slope_intersects();
139 my $random_factors = $self->random_factors();
141 my $error;
143 my @addends = ();
145 print STDERR join("\n", ("DV", Dumper($dependent_variables), "FF", Dumper($fixed_factors), "RF", Dumper($random_factors), "TF", $tempfile, "FFI", Dumper($fixed_factors_interaction), "VSI: ", Dumper($variable_slope_intersects)));
147 print STDERR Dumper($fixed_factors);
148 my $model = "";
150 if (! $dependent_variables || scalar(@$dependent_variables)==0) {
151 die "Need a dependent variable(s) set in CXGN::MixedModels... Ciao!";
154 my $formatted_fixed_factors = "";
155 if (@$fixed_factors) {
156 $formatted_fixed_factors = join(" + ", @$fixed_factors);
157 push @addends, $formatted_fixed_factors;
160 my $formatted_fixed_factors_interaction = "";
161 foreach my $interaction (@$fixed_factors_interaction) {
162 my $terms = "";
163 if (ref($interaction)) {
164 $terms = join("*", @$interaction);
165 push @addends, $terms;
167 else {
168 $error = "Interaction terms are not correctly defined.";
172 my $formatted_variable_slope_intersects = "";
173 foreach my $variable_slope_groups (@$variable_slope_intersects) {
174 if (exists($variable_slope_groups->[0]) && exists($variable_slope_groups->[1])) {
175 my $term = " (1+$variable_slope_groups->[0] \| $variable_slope_groups->[1]) ";
176 print STDERR "TERM: $term\n";
177 $formatted_variable_slope_intersects .= $term;
178 push @addends, $formatted_variable_slope_intersects;
183 my $formatted_random_factors = "";
184 # if ($random_factors_random_slope) {
185 # $formatted_random_factors = " (1 + $random_factors->[0] | $random_factors->[1]) ";
188 # else {
189 foreach my $rf (@$random_factors) {
190 if ($rf) {
191 $formatted_random_factors .= "(1|$rf)" ;
192 print STDERR " formatted random factor now $formatted_random_factors\n";
193 push @addends, $formatted_random_factors;
198 $model .= join(" + ", @addends);
200 return ($model, $error);
203 sub generate_model_sommer {
204 my $self = shift;
206 my $tempfile = $self->tempfile();
207 my @dependent_variables_cleaned = map { make_R_variable_name($_) } @{$self->dependent_variables()};
208 my $dependent_variables = \@dependent_variables_cleaned;
209 my $fixed_factors = $self->fixed_factors();
210 my $fixed_factors_interaction = $self->fixed_factors_interaction();
211 my $random_factors_interaction = $self->random_factors_interaction();
212 my $variable_slope_intersects = $self->variable_slope_intersects();
213 my $random_factors = $self->random_factors();
214 my $formula = "";
215 print STDERR "FIXED FACTORS FED TO GENERATE MODEL SOMMER: ".Dumper($fixed_factors);
216 print STDERR "FIXED Interaction FACTORS FED TO GENERATE MODEL SOMMER: ".Dumper($fixed_factors_interaction);
217 print STDERR "RANDOM Interaction FACTORS FED TO GENERATE MODEL SOMMER: ".Dumper($random_factors_interaction);
219 my $error;
221 ## generate the fixed factor formula
223 my $mmer_fixed_factors = "";
224 my $mmer_random_factors = "";
225 my $mmer_fixed_factors_interaction = "";
226 my $mmer_variable_slope_intersects ="";
228 if (scalar(@$dependent_variables) > 1) { return ("", "For Sommer, only one trait can be analyzed at one time. Please go back and select only one trait or select lme4.") }
230 if (scalar(@$dependent_variables) > 0) {
231 print STDERR "preparing fixed factors...\n";
232 if (scalar(@$fixed_factors) == 0) { $mmer_fixed_factors = "1"; }
233 else { $mmer_fixed_factors = join(" + ", @$fixed_factors); }
235 print STDERR "DEPENDENT VARIABLES: ".Dumper($dependent_variables);
237 $mmer_fixed_factors = make_R_variable_name($dependent_variables->[0]) ." ~ ". $mmer_fixed_factors;
239 if (scalar(@$random_factors)== 0) {$mmer_random_factors = 1; }
241 else {
242 print STDERR "Preparing random factors...\n";
243 $mmer_random_factors = join("+", @$random_factors);
246 if (scalar(@$fixed_factors_interaction)== 0) {$mmer_fixed_factors_interaction = ""; }
248 else {
250 foreach my $interaction(@$fixed_factors_interaction){
253 if (scalar(@$interaction) != 2) { $error = "interaction needs to be pairs :-(";}
256 else { $mmer_fixed_factors_interaction .= " + ". join(":", @$interaction);}
261 #####
262 # if (scalar(@$variable_slope_intersects)== 0) {$mmer_variable_slope_intersects = ""; }
264 # else {
266 # foreach my $intersects(@$variable_slope_intersects){
269 # if (scalar(@$intersects) != 2) { $error = "intersects needs to be pairs :-(";}
270 # #if (scalar(@$random_factors_interaction)== 1) { $error .= "Works only with one interaction for now! :-(";}
272 # else { $mmer_variable_slope_intersects .= " + vsr(". join(",", @$intersects) . ")";} # vsr(Days, Subject)
278 # $mmer_random_factors = " ~ ".$mmer_random_factors ." ".$mmer_fixed_factors_interaction." ".$mmer_variable_slope_intersects;
281 # <<<<<<< HEAD
282 # =======
283 if (scalar(@$variable_slope_intersects)== 0) {$mmer_variable_slope_intersects = ""; }
285 else {
287 foreach my $intersects(@$variable_slope_intersects){
290 if (scalar(@$intersects) != 2) { $error = "intersects needs to be pairs :-(";}
291 #if (scalar(@$random_factors_interaction)== 1) { $error .= "Works only with one interaction for now! :-(";}
293 else { $mmer_variable_slope_intersects .= " + vsr(". join(",", @$intersects) . ")";} # vsr(Days, Subject)
297 if ($mmer_random_factors){
298 $formula = " ~ ".$mmer_random_factors ;
300 if ($mmer_fixed_factors_interaction) {
301 $formula.=" ".$mmer_fixed_factors_interaction;
303 if ($mmer_variable_slope_intersects) {
304 $formula.=" ".$mmer_variable_slope_intersects;
307 # >>>>>>> master
308 #location:genotype
310 print STDERR "mmer_fixed_factors = $mmer_fixed_factors\n";
311 print STDERR "mmer_random_factors = $formula\n";
313 #my $data = { fixed_factors => $mmer_fixed_factors,
314 # random_factors => $mmer_random_factors,
317 my $model = [ $mmer_fixed_factors, $formula ];
319 print STDERR "Data returned from generate_model_sommer: ".Dumper($model);
321 return ($model, $error);
323 else {
324 return ("", $error);
330 =head2 run_model()
332 runs the model along with the data provided in the phenotyping file.
334 Produces two results files. If germplasm was a random factor, is will produce files
335 with the tempfiles stem and the extensions .adjustedBLUPs and .BLUPs, if it was a
336 fixed factor, the extension are .adjustedBLUEs and .BLUEs. The files have the
337 accession identifiers in the first column, with the remaining columns being the
338 adjusted BLUEs, BLUEs, adjusted BLUPs or BLUPs for all the initially selected
339 traits.
341 The files from the dataset will contain trait names that are incompatible with R. These names are converted using the clean_file subroutine. The conversion takes place and is saved in a file with a .clean extension. Then, the .clean file is moved to the previous file name, such that there is not difference in file naming.
344 The result files will initially contain these R-based names as well. The conversion between R and dataset names are stored in a file with the extension .traits . The inital result files will be converted back to dataset names using the sub convert_file_headers_back_to_breedbase_traits() function. The conversion is saved in a file with the .original_traits extension, which is then moved back to the original file name once the conversion is complete.
348 =cut
350 sub run_model {
351 my $self = shift;
352 my $backend = shift || 'Slurm';
353 my $cluster_host = shift || "localhost";
354 my $cluster_shared_tempdir = shift;
356 my $random_factors = '"'.join('","', @{$self->random_factors()}).'"';
357 my $fixed_factors = '"'.join('","',@{$self->fixed_factors()}).'"';
358 my $dependent_variables = '"'.join('","',@{$self->dependent_variables()}).'"';
360 my $model;
361 my $error;
362 my $executable;
364 eval {
366 if ($self->engine() eq "lme4") {
367 ($model, $error) = $self->generate_model();
368 $executable = " R/mixed_models.R ";
371 elsif ($self->engine() eq "sommer") {
372 ($model, $error) = $self->generate_model_sommer();
373 $executable = " R/mixed_models_sommer.R ";
376 if ($error) { die "$error"; }
378 my $dependent_variables_R = make_R_variable_name($dependent_variables);
380 # generate params_file
382 my $param_file = $self->tempfile().".params";
383 open(my $F, ">", $param_file) || die "Can't open $param_file for writing.";
384 print $F "dependent_variables <- c($dependent_variables_R)\n";
385 print $F "random_factors <- c($random_factors)\n";
386 print $F "fixed_factors <- c($fixed_factors)\n";
388 if ($self->engine() eq "lme4") {
389 print $F "model <- \"$model\"\n";
391 elsif ($self->engine() eq "sommer") {
392 print $F "fixed_model <- \"$model->[0]\"\n";
393 print $F "random_model <- \"$model->[1]\"\n";
395 close($F);
397 # clean phenotype file so that trait names are R compatible
399 my $clean_tempfile = $self->clean_file($self->tempfile());
401 # run r script to create model
403 my $cmd = "R CMD BATCH '--args datafile=\"".$clean_tempfile."\" paramfile=\"".$self->tempfile().".params\"' $executable ". $self->tempfile().".out";
404 print STDERR "running R command $cmd...\n";
406 print STDERR "running R command $clean_tempfile...\n";
408 my $ctr = CXGN::Tools::Run->new( { backend => $backend, working_dir => dirname($self->tempfile()), submit_host => $cluster_host } );
411 $ctr->run_cluster($cmd);
413 while ($ctr->alive()) {
414 sleep(1);
417 # replace the R-compatible traits with original trait names
419 print STDERR "Converting files back to non-R headers...\n";
420 foreach my $f (
421 $self->tempfile().".adjustedBLUPs",
422 $self->tempfile().".BLUPs",
423 $self->tempfile().".BLUEs",
424 $self->tempfile().".adjustedBLUEs",
425 $self->tempfile().".anova",
426 $self->tempfile().".varcomp",
429 my $conversion_matrix = $self->read_conversion_matrix($self->tempfile().".traits");
431 if (-e $f) {
432 $self->convert_file_headers_back_to_breedbase_traits($f, $conversion_matrix);
434 else {
435 print STDERR "File $f does not exist, not converting. This may be normal.\n";
440 if ($@) {
441 $error = $@;
444 return $error;
447 =head2 make_R_variable_name
449 Usage:
450 Desc:
451 Ret:
452 Args:
453 Side Effects:
454 Example:
456 =cut
458 sub make_R_variable_name {
459 my $name = shift;
460 $name =~ s/\s/\_/g;
461 $name =~ s/\//\_/g;
462 $name =~ tr/ /./;
463 $name =~ tr/\//./;
464 $name =~ s/\:/\_/g;
465 $name =~ s/\|/\_/g;
466 $name =~ s/\-/\_/g;
468 return $name;
471 sub clean_file {
472 my $self = shift;
473 my $file = shift;
475 open(my $PF, "<", $file) || die "Can't open pheno file ".$file."_phenotype.txt";
476 open(my $CLEAN, ">", $file.".clean") || die "Can't open ".$file.".clean for writing";
478 open(my $TRAITS, ">", $file.".traits") || die "Can't open ".$file.".traits for writing";
481 my $header = <$PF>;
482 chomp($header);
484 my @fields = split /\t/, $header;
486 my @file_traits = @fields[ 39 .. @fields-1 ];
487 my @other_headers = @fields[ 0 .. 38 ];
489 print STDERR "FIELDS: ".Dumper(\@file_traits);
491 foreach my $t (@file_traits) {
492 my $R_t = make_R_variable_name($t);
493 print $TRAITS "$R_t\t$t\n";
494 $t = $R_t;
497 print STDERR "FILE TRAITS: ".Dumper(\@file_traits);
499 my @new_header = (@other_headers, @file_traits);
500 print $CLEAN join("\t", @new_header)."\n";
502 while(<$PF>) {
503 print $CLEAN $_;
506 close($PF);
507 print STDERR "moving $file to $file.before_clean...\n";
508 move($file, $file.".before_clean");
510 print STDERR "moving $file.clean to $file...\n";
511 move($file.".clean", $file);
513 return $file;
517 sub convert_file_headers_back_to_breedbase_traits {
518 my $self = shift;
519 my $file = shift;
520 my $conversion_matrix = shift;
522 open(my $F, "<", $file) || die "Can't open $file\n";
524 print STDERR "Opening ".$self->tempfile().".original_traits for writing...\n";
525 open(my $G, ">", $file.".original_traits") || die "Can't open $file.original_traits";
527 my $header = <$F>;
528 chomp($header);
530 my @fields = split /\t/, $header;
532 foreach my $f (@fields) {
533 if ($conversion_matrix->{$f}) {
534 print STDERR "Converting $f to $conversion_matrix->{$f}...\n";
535 $f = $conversion_matrix->{$f};
540 print $G join("\t", @fields)."\n";
541 while(<$F>) {
542 chomp;
544 # replace NA or . with undef throughout the file
545 # (strings are not accepted by store phenotypes routine
546 # used in analysis storage).
548 my @fields = split /\t/;
549 foreach my $f (@fields) {
550 if ($f eq "NA" || $f eq '.') { $f = undef; }
552 my $line = join("\t", @fields);
553 print $G "$line\n";
555 close($G);
557 print STDERR "move file $file.original_traits back to $file...\n";
558 move($file.".original_traits", $file);
561 sub read_conversion_matrix {
562 my $self = shift;
563 my $file = shift;
565 my $conversion_file = $file;
567 open(my $F, "<", $conversion_file) || die "Can't open file $conversion_file";
569 my %conversion_matrix;
571 while (<$F>) {
572 chomp;
573 my ($new, $old) = split "\t";
574 $conversion_matrix{$new} = $old;
576 return \%conversion_matrix;