interaction on sommer
[sgn.git] / lib / CXGN / MixedModels.pm
blob98d8cda4f52abcffb1374763a0d02ba280f81290
2 =head1 NAME
4 CXGN::MixedModels - a package to run user-specified mixed models
6 =head1 DESCRIPTION
8 my $mm = CXGN::MixedModels->new();
9 my $mm->phenotype_file("t/data/phenotype_data.csv");
10 my $mm->dependent_variables(qw | |);
11 my $mm->fixed_factors( qw | | );
12 my $mm->random_factors( qw| | );
13 my $mm->traits( qw| | );
15 =head1 AUTHOR
17 Lukas Mueller <lam87@cornell.edu>
19 =head1 METHODS
21 =cut
23 package CXGN::MixedModels;
25 use Moose;
26 use Data::Dumper;
27 use File::Slurp qw| slurp |;
28 use File::Basename;
29 use File::Copy;
30 use CXGN::Tools::Run;
31 use CXGN::Phenotypes::File;
33 =head2 dependent_variables()
35 sets the dependent variables (a listref of traits)
37 =cut
39 has 'dependent_variables' => (is => 'rw', isa => 'ArrayRef[Str]|Undef');
41 =head2 fixed_factors()
43 sets the fixed factors (listref)
45 =cut
47 has 'fixed_factors' => (is => 'rw', isa => 'Ref', default => sub {[]});
49 =head2 fixed_factors_interaction()
51 sets the fixed factors with interaction (listref)
53 =cut
55 has 'fixed_factors_interaction' => (is => 'rw', isa => 'Ref', default => sub{[]});
57 =head2 random_factors_interaction()
59 sets the random factors with interaction (listref)
61 =cut
63 has 'random_factors_interaction' => (is => 'rw', isa => 'Ref', default => sub{[]});
65 =head2 variable_slope_factors()
67 =cut
69 has 'variable_slope_factors' => (is => 'rw', isa => 'Ref', default => sub{[]});
71 =head2 random_factors()
73 =cut
75 has 'random_factors' => (is => 'rw', isa => 'Ref', default => sub {[]});
77 =head2 variable_slop_intersects
79 =cut
81 has 'variable_slope_intersects' => (is => 'rw', isa => 'Ref', default => sub {[]});
83 =head2 traits()
85 sets the traits
87 =cut
89 has 'traits' => (is => 'rw', isa => 'Ref');
91 =head2 levels()
94 =head2 engine()
96 sets the engine. Either sommer or lme4. Default: lme4.
98 =cut
100 has 'engine' => (is => 'rw', isa => 'Maybe[Str]', default => 'lme4' );
102 has 'levels' => (is => 'rw', isa => 'HashRef' );
104 has 'phenotype_file' => (is => 'rw', isa => 'CXGN::Phenotypes::File|Undef');
106 =head2 tempfile()
108 the tempfile that contains the phenotypic information.
110 =cut
112 has 'tempfile' => (is => 'rw', isa => 'Str|Undef');
114 sub BUILD {
115 my $self = shift;
117 my $phenotype_file;
119 if ($self->tempfile()) {
120 $phenotype_file = CXGN::Phenotypes::File->new( { file => $self->tempfile() } );
122 $self->phenotype_file($phenotype_file);
126 =head2 generate_model()
128 generates the model string, in lme4 format, from the current parameters
130 =cut
132 sub generate_model {
133 my $self = shift;
135 my $tempfile = $self->tempfile();
136 my $dependent_variables = $self->dependent_variables();
137 my $fixed_factors = $self->fixed_factors();
138 my $fixed_factors_interaction = $self->fixed_factors_interaction();
139 my $variable_slope_intersects = $self->variable_slope_intersects();
140 my $random_factors = $self->random_factors();
142 my $error;
144 my @addends = ();
146 print STDERR join("\n", ("DV", Dumper($dependent_variables), "FF", Dumper($fixed_factors), "RF", Dumper($random_factors), "TF", $tempfile, "FFI", Dumper($fixed_factors_interaction), "VSI: ", Dumper($variable_slope_intersects)));
148 print STDERR Dumper($fixed_factors);
149 my $model = "";
151 if (! $dependent_variables || scalar(@$dependent_variables)==0) {
152 die "Need a dependent variable(s) set in CXGN::MixedModels... Ciao!";
155 my $formatted_fixed_factors = "";
156 if (@$fixed_factors) {
157 $formatted_fixed_factors = join(" + ", @$fixed_factors);
158 push @addends, $formatted_fixed_factors;
161 my $formatted_fixed_factors_interaction = "";
162 foreach my $interaction (@$fixed_factors_interaction) {
163 my $terms = "";
164 if (ref($interaction)) {
165 $terms = join("*", @$interaction);
166 push @addends, $terms;
168 else {
169 $error = "Interaction terms are not correctly defined.";
173 my $formatted_variable_slope_intersects = "";
174 foreach my $variable_slope_groups (@$variable_slope_intersects) {
175 if (exists($variable_slope_groups->[0]) && exists($variable_slope_groups->[1])) {
176 my $term = " (1+$variable_slope_groups->[0] \| $variable_slope_groups->[1]) ";
177 print STDERR "TERM: $term\n";
178 $formatted_variable_slope_intersects .= $term;
179 push @addends, $formatted_variable_slope_intersects;
184 my $formatted_random_factors = "";
185 # if ($random_factors_random_slope) {
186 # $formatted_random_factors = " (1 + $random_factors->[0] | $random_factors->[1]) ";
189 # else {
190 if (@$random_factors) {
191 $formatted_random_factors = join(" + ", map { "(1|$_)" } @$random_factors);
192 push @addends, $formatted_random_factors;
196 $model .= join(" + ", @addends);
198 return ($model, $error);
201 sub generate_model_sommer {
202 my $self = shift;
204 my $tempfile = $self->tempfile();
205 my @dependent_variables_cleaned = map { make_R_variable_name($_) } @{$self->dependent_variables()};
206 my $dependent_variables = \@dependent_variables_cleaned;
207 my $fixed_factors = $self->fixed_factors();
208 my $fixed_factors_interaction = $self->fixed_factors_interaction();
209 my $random_factors_interaction = $self->random_factors_interaction();
210 my $variable_slope_intersects = $self->variable_slope_intersects();
211 my $random_factors = $self->random_factors();
213 print STDERR "FIXED FACTORS FED TO GENERATE MODEL SOMMER: ".Dumper($fixed_factors);
214 print STDERR "FIXED InteractionFACTORS FED TO GENERATE MODEL SOMMER: ".Dumper($fixed_factors_interaction);
215 print STDERR "RANDOM InteractionFACTORS FED TO GENERATE MODEL SOMMER: ".Dumper($random_factors_interaction);
217 my $error;
219 ## generate the fixed factor formula
221 my $mmer_fixed_factors = "";
222 my $mmer_random_factors = "";
223 my $mmer_fixed_factors_interaction = "";
225 if (scalar(@$dependent_variables) > 1) { die "Works only with one trait for now! :-("; }
226 if (scalar(@$dependent_variables) > 0) {
227 if (scalar(@$fixed_factors) == 0) { $mmer_fixed_factors = "1"; }
228 else { $mmer_fixed_factors = join(" + ", @$fixed_factors); }
230 print STDERR "DEPENDENT VARIABLES: ".Dumper($dependent_variables);
232 $mmer_fixed_factors = make_R_variable_name($dependent_variables->[0]) ." ~ ". $mmer_fixed_factors;
234 if (scalar(@$random_factors)== 0) {$mmer_random_factors = "1"; }
235 else { $mmer_random_factors = join("+", @$random_factors);}
237 if (scalar(@$fixed_factors_interaction)== 0) {$mmer_fixed_factors_interaction = ""; }
239 else {
241 foreach my $interaction(@$fixed_factors_interaction){
244 if (scalar(@$interaction) != 2) { $error = "interaction needs to be pairs :-(";}
245 #if (scalar(@$random_factors_interaction)== 1) { $error .= "Works only with one interaction for now! :-(";}
247 else { $mmer_fixed_factors_interaction .= " + ". join(":", @$interaction);}
251 $mmer_random_factors = " ~ ".$mmer_random_factors ." ".$mmer_fixed_factors_interaction;
253 #location:genotype
255 print STDERR "mmer_fixed_factors = $mmer_fixed_factors\n";
256 print STDERR "mmer_random_factors = $mmer_random_factors\n";
258 #my $data = { fixed_factors => $mmer_fixed_factors,
259 # random_factors => $mmer_random_factors,
262 my $model = [ $mmer_fixed_factors, $mmer_random_factors ];
264 print STDERR "Data returned from generate_model_sommer: ".Dumper($model);
266 return ($model, $error);
271 =head2 run_model()
273 runs the model along with the data provided in the phenotyping file.
275 Produces two results files. If germplasm was a random factor, is will produce files
276 with the tempfiles stem and the extensions .adjustedBLUPs and .BLUPs, if it was a
277 fixed factor, the extension are .adjustedBLUEs and .BLUEs. The files have the
278 accession identifiers in the first column, with the remaining columns being the
279 adjusted BLUEs, BLUEs, adjusted BLUPs or BLUPs for all the initially selected
280 traits.
282 =cut
284 sub run_model {
285 my $self = shift;
286 my $backend = shift || 'Slurm';
287 my $cluster_host = shift || "localhost";
288 my $cluster_shared_tempdir = shift;
290 my $random_factors = '"'.join('","', @{$self->random_factors()}).'"';
291 my $fixed_factors = '"'.join('","',@{$self->fixed_factors()}).'"';
292 my $dependent_variables = '"'.join('","',@{$self->dependent_variables()}).'"';
294 my $model;
295 my $error;
296 my $executable;
297 if ($self->engine() eq "lme4") {
298 ($model, $error) = $self->generate_model();
299 $executable = " R/mixed_models.R ";
302 elsif ($self->engine() eq "sommer") {
303 ($model, $error) = $self->generate_model_sommer();
304 $executable = " R/mixed_models_sommer.R ";
307 my $dependent_variables_R = make_R_variable_name($dependent_variables);
309 # generate params_file
311 my $param_file = $self->tempfile().".params";
312 open(my $F, ">", $param_file) || die "Can't open $param_file for writing.";
313 print $F "dependent_variables <- c($dependent_variables_R)\n";
314 print $F "random_factors <- c($random_factors)\n";
315 print $F "fixed_factors <- c($fixed_factors)\n";
317 if ($self->engine() eq "lme4") {
318 print $F "model <- \"$model\"\n";
320 elsif ($self->engine() eq "sommer") {
321 print $F "fixed_model <- \"$model->[0]\"\n";
322 print $F "random_model <- \"$model->[1]\"\n";
324 close($F);
326 # clean phenotype file so that trait names are R compatible
328 my $clean_tempfile = $self->clean_file($self->tempfile());
330 # run r script to create model
332 my $cmd = "R CMD BATCH '--args datafile=\"".$clean_tempfile."\" paramfile=\"".$self->tempfile().".params\"' $executable ". $self->tempfile().".out";
333 print STDERR "running R command $cmd...\n";
335 print STDERR "running R command $clean_tempfile...\n";
337 my $ctr = CXGN::Tools::Run->new( { backend => $backend, working_dir => dirname($self->tempfile()), submit_host => $cluster_host } );
340 $ctr->run_cluster($cmd);
342 while ($ctr->alive()) {
343 sleep(1);
347 =head2 make_R_variable_name
349 Usage:
350 Desc:
351 Ret:
352 Args:
353 Side Effects:
354 Example:
356 =cut
358 sub make_R_variable_name {
359 my $name = shift;
360 $name =~ s/\s/\_/g;
361 $name =~ s/\//\_/g;
362 $name =~ tr/ /./;
363 $name =~ tr/\//./;
364 $name =~ s/\:/\_/g;
365 $name =~ s/\|/\_/g;
366 $name =~ s/\-/\_/g;
368 return $name;
371 sub clean_file {
372 my $self = shift;
373 my $file = shift;
375 open(my $PF, "<", $file) || die "Can't open pheno file ".$file."_phenotype.txt";
376 open(my $CLEAN, ">", $file.".clean") || die "Can't open ".$file.".clean for writing";
378 my $header = <$PF>;
379 chomp($header);
381 my @fields = split /\t/, $header;
383 my @file_traits = @fields[ 39 .. @fields-1 ];
384 my @other_headers = @fields[ 0 .. 38 ];
386 print STDERR "FIELDS: ".Dumper(\@file_traits);
388 foreach my $t (@file_traits) {
389 $t = make_R_variable_name($t);
392 print STDERR "FILE TRAITS: ".Dumper(\@file_traits);
394 my @new_header = (@other_headers, @file_traits);
395 print $CLEAN join("\t", @new_header)."\n";
397 while(<$PF>) {
398 print $CLEAN $_;
401 move($file, $file.".before_clean");
402 move($file.".clean", $file);
404 return $file;