4 CXGN::MixedModels - a package to run user-specified mixed models
8 my $mm = CXGN::MixedModels->new();
9 my $mm->phenotype_file("t/data/phenotype_data.csv");
10 my $mm->dependent_variables(qw | |);
11 my $mm->fixed_factors( qw | | );
12 my $mm->random_factors( qw| | );
13 my $mm->traits( qw| | );
17 Lukas Mueller <lam87@cornell.edu>
23 package CXGN
::MixedModels
;
27 use File
::Slurp qw
| slurp
|;
31 use CXGN
::Phenotypes
::File
;
33 =head2 dependent_variables()
35 sets the dependent variables (a listref of traits)
39 has
'dependent_variables' => (is
=> 'rw', isa
=> 'ArrayRef[Str]|Undef');
41 =head2 fixed_factors()
43 sets the fixed factors (listref)
47 has
'fixed_factors' => (is
=> 'rw', isa
=> 'Ref', default => sub {[]});
49 =head2 fixed_factors_interaction()
51 sets the fixed factors with interaction (listref)
55 has
'fixed_factors_interaction' => (is
=> 'rw', isa
=> 'Ref', default => sub{[]});
57 =head2 random_factors_interaction()
59 sets the random factors with interaction (listref)
63 has
'random_factors_interaction' => (is
=> 'rw', isa
=> 'Ref', default => sub{[]});
65 =head2 variable_slope_factors()
69 has
'variable_slope_factors' => (is
=> 'rw', isa
=> 'Ref', default => sub{[]});
71 =head2 random_factors()
75 has
'random_factors' => (is
=> 'rw', isa
=> 'Ref', default => sub {[]});
77 =head2 variable_slop_intersects
81 has
'variable_slope_intersects' => (is
=> 'rw', isa
=> 'Ref', default => sub {[]});
89 has
'traits' => (is
=> 'rw', isa
=> 'Ref');
96 sets the engine. Either sommer or lme4. Default: lme4.
100 has
'engine' => (is
=> 'rw', isa
=> 'Maybe[Str]', default => 'lme4' );
102 has
'levels' => (is
=> 'rw', isa
=> 'HashRef' );
104 has
'phenotype_file' => (is
=> 'rw', isa
=> 'CXGN::Phenotypes::File|Undef');
108 the tempfile that contains the phenotypic information.
112 has
'tempfile' => (is
=> 'rw', isa
=> 'Str|Undef');
119 if ($self->tempfile()) {
120 $phenotype_file = CXGN
::Phenotypes
::File
->new( { file
=> $self->tempfile() } );
122 $self->phenotype_file($phenotype_file);
126 =head2 generate_model()
128 generates the model string, in lme4 format, from the current parameters
135 my $tempfile = $self->tempfile();
136 my $dependent_variables = $self->dependent_variables();
137 my $fixed_factors = $self->fixed_factors();
138 my $fixed_factors_interaction = $self->fixed_factors_interaction();
139 my $variable_slope_intersects = $self->variable_slope_intersects();
140 my $random_factors = $self->random_factors();
146 print STDERR
join("\n", ("DV", Dumper
($dependent_variables), "FF", Dumper
($fixed_factors), "RF", Dumper
($random_factors), "TF", $tempfile, "FFI", Dumper
($fixed_factors_interaction), "VSI: ", Dumper
($variable_slope_intersects)));
148 print STDERR Dumper
($fixed_factors);
151 if (! $dependent_variables || scalar(@
$dependent_variables)==0) {
152 die "Need a dependent variable(s) set in CXGN::MixedModels... Ciao!";
155 my $formatted_fixed_factors = "";
156 if (@
$fixed_factors) {
157 $formatted_fixed_factors = join(" + ", @
$fixed_factors);
158 push @addends, $formatted_fixed_factors;
161 my $formatted_fixed_factors_interaction = "";
162 foreach my $interaction (@
$fixed_factors_interaction) {
164 if (ref($interaction)) {
165 $terms = join("*", @
$interaction);
166 push @addends, $terms;
169 $error = "Interaction terms are not correctly defined.";
173 my $formatted_variable_slope_intersects = "";
174 foreach my $variable_slope_groups (@
$variable_slope_intersects) {
175 if (exists($variable_slope_groups->[0]) && exists($variable_slope_groups->[1])) {
176 my $term = " (1+$variable_slope_groups->[0] \| $variable_slope_groups->[1]) ";
177 print STDERR
"TERM: $term\n";
178 $formatted_variable_slope_intersects .= $term;
179 push @addends, $formatted_variable_slope_intersects;
184 my $formatted_random_factors = "";
185 # if ($random_factors_random_slope) {
186 # $formatted_random_factors = " (1 + $random_factors->[0] | $random_factors->[1]) ";
190 if (@
$random_factors) {
191 $formatted_random_factors = join(" + ", map { "(1|$_)" } @
$random_factors);
192 push @addends, $formatted_random_factors;
196 $model .= join(" + ", @addends);
198 return ($model, $error);
201 sub generate_model_sommer
{
204 my $tempfile = $self->tempfile();
205 my @dependent_variables_cleaned = map { make_R_variable_name
($_) } @
{$self->dependent_variables()};
206 my $dependent_variables = \
@dependent_variables_cleaned;
207 my $fixed_factors = $self->fixed_factors();
208 my $fixed_factors_interaction = $self->fixed_factors_interaction();
209 my $random_factors_interaction = $self->random_factors_interaction();
210 my $variable_slope_intersects = $self->variable_slope_intersects();
211 my $random_factors = $self->random_factors();
213 print STDERR
"FIXED FACTORS FED TO GENERATE MODEL SOMMER: ".Dumper
($fixed_factors);
214 print STDERR
"FIXED InteractionFACTORS FED TO GENERATE MODEL SOMMER: ".Dumper
($fixed_factors_interaction);
215 print STDERR
"RANDOM InteractionFACTORS FED TO GENERATE MODEL SOMMER: ".Dumper
($random_factors_interaction);
219 ## generate the fixed factor formula
221 my $mmer_fixed_factors = "";
222 my $mmer_random_factors = "";
223 my $mmer_fixed_factors_interaction = "";
225 if (scalar(@
$dependent_variables) > 1) { die "Works only with one trait for now! :-("; }
226 if (scalar(@
$dependent_variables) > 0) {
227 if (scalar(@
$fixed_factors) == 0) { $mmer_fixed_factors = "1"; }
228 else { $mmer_fixed_factors = join(" + ", @
$fixed_factors); }
230 print STDERR
"DEPENDENT VARIABLES: ".Dumper
($dependent_variables);
232 $mmer_fixed_factors = make_R_variable_name
($dependent_variables->[0]) ." ~ ". $mmer_fixed_factors;
234 if (scalar(@
$random_factors)== 0) {$mmer_random_factors = "1"; }
235 else { $mmer_random_factors = join("+", @
$random_factors);}
237 if (scalar(@
$fixed_factors_interaction)== 0) {$mmer_fixed_factors_interaction = ""; }
241 foreach my $interaction(@
$fixed_factors_interaction){
244 if (scalar(@
$interaction) != 2) { $error = "interaction needs to be pairs :-(";}
245 #if (scalar(@$random_factors_interaction)== 1) { $error .= "Works only with one interaction for now! :-(";}
247 else { $mmer_fixed_factors_interaction .= " + ". join(":", @
$interaction);}
251 $mmer_random_factors = " ~ ".$mmer_random_factors ." ".$mmer_fixed_factors_interaction;
255 print STDERR
"mmer_fixed_factors = $mmer_fixed_factors\n";
256 print STDERR
"mmer_random_factors = $mmer_random_factors\n";
258 #my $data = { fixed_factors => $mmer_fixed_factors,
259 # random_factors => $mmer_random_factors,
262 my $model = [ $mmer_fixed_factors, $mmer_random_factors ];
264 print STDERR
"Data returned from generate_model_sommer: ".Dumper
($model);
266 return ($model, $error);
273 runs the model along with the data provided in the phenotyping file.
275 Produces two results files. If germplasm was a random factor, is will produce files
276 with the tempfiles stem and the extensions .adjustedBLUPs and .BLUPs, if it was a
277 fixed factor, the extension are .adjustedBLUEs and .BLUEs. The files have the
278 accession identifiers in the first column, with the remaining columns being the
279 adjusted BLUEs, BLUEs, adjusted BLUPs or BLUPs for all the initially selected
286 my $backend = shift || 'Slurm';
287 my $cluster_host = shift || "localhost";
288 my $cluster_shared_tempdir = shift;
290 my $random_factors = '"'.join('","', @
{$self->random_factors()}).'"';
291 my $fixed_factors = '"'.join('","',@
{$self->fixed_factors()}).'"';
292 my $dependent_variables = '"'.join('","',@
{$self->dependent_variables()}).'"';
297 if ($self->engine() eq "lme4") {
298 ($model, $error) = $self->generate_model();
299 $executable = " R/mixed_models.R ";
302 elsif ($self->engine() eq "sommer") {
303 ($model, $error) = $self->generate_model_sommer();
304 $executable = " R/mixed_models_sommer.R ";
307 my $dependent_variables_R = make_R_variable_name
($dependent_variables);
309 # generate params_file
311 my $param_file = $self->tempfile().".params";
312 open(my $F, ">", $param_file) || die "Can't open $param_file for writing.";
313 print $F "dependent_variables <- c($dependent_variables_R)\n";
314 print $F "random_factors <- c($random_factors)\n";
315 print $F "fixed_factors <- c($fixed_factors)\n";
317 if ($self->engine() eq "lme4") {
318 print $F "model <- \"$model\"\n";
320 elsif ($self->engine() eq "sommer") {
321 print $F "fixed_model <- \"$model->[0]\"\n";
322 print $F "random_model <- \"$model->[1]\"\n";
326 # clean phenotype file so that trait names are R compatible
328 my $clean_tempfile = $self->clean_file($self->tempfile());
330 # run r script to create model
332 my $cmd = "R CMD BATCH '--args datafile=\"".$clean_tempfile."\" paramfile=\"".$self->tempfile().".params\"' $executable ". $self->tempfile().".out";
333 print STDERR
"running R command $cmd...\n";
335 print STDERR
"running R command $clean_tempfile...\n";
337 my $ctr = CXGN
::Tools
::Run
->new( { backend
=> $backend, working_dir
=> dirname
($self->tempfile()), submit_host
=> $cluster_host } );
340 $ctr->run_cluster($cmd);
342 while ($ctr->alive()) {
347 =head2 make_R_variable_name
358 sub make_R_variable_name
{
375 open(my $PF, "<", $file) || die "Can't open pheno file ".$file."_phenotype.txt";
376 open(my $CLEAN, ">", $file.".clean") || die "Can't open ".$file.".clean for writing";
381 my @fields = split /\t/, $header;
383 my @file_traits = @fields[ 39 .. @fields-1 ];
384 my @other_headers = @fields[ 0 .. 38 ];
386 print STDERR
"FIELDS: ".Dumper
(\
@file_traits);
388 foreach my $t (@file_traits) {
389 $t = make_R_variable_name
($t);
392 print STDERR
"FILE TRAITS: ".Dumper
(\
@file_traits);
394 my @new_header = (@other_headers, @file_traits);
395 print $CLEAN join("\t", @new_header)."\n";
401 move
($file, $file.".before_clean");
402 move
($file.".clean", $file);