4 CXGN::MixedModels - a package to run user-specified mixed models
8 my $mm = CXGN::MixedModels->new();
9 my $mm->phenotype_file("t/data/phenotype_data.csv");
10 my $mm->dependent_variables(qw | |);
11 my $mm->fixed_factors( qw | | );
12 my $mm->random_factors( qw| | );
13 my $mm->traits( qw| | );
17 Lukas Mueller <lam87@cornell.edu>
23 package CXGN
::MixedModels
;
30 use CXGN
::Phenotypes
::File
;
32 =head2 dependent_variables()
34 sets the dependent variables (a listref of traits)
38 has
'dependent_variables' => (is
=> 'rw', isa
=> 'ArrayRef[Str]|Undef');
40 =head2 fixed_factors()
42 sets the fixed factors (listref)
46 has
'fixed_factors' => (is
=> 'rw', isa
=> 'Ref', default => sub {[]});
48 =head2 fixed_factors_interaction()
50 sets the fixed factors with interaction (listref)
54 has
'fixed_factors_interaction' => (is
=> 'rw', isa
=> 'Ref', default => sub{[]});
56 =head2 random_factors_interaction()
58 sets the random factors with interaction (listref)
62 has
'random_factors_interaction' => (is
=> 'rw', isa
=> 'Ref', default => sub{[]});
64 =head2 variable_slope_factors()
68 has
'variable_slope_factors' => (is
=> 'rw', isa
=> 'Ref', default => sub{[]});
70 =head2 random_factors()
74 has
'random_factors' => (is
=> 'rw', isa
=> 'Ref', default => sub {[]});
76 =head2 variable_slop_intersects
80 has
'variable_slope_intersects' => (is
=> 'rw', isa
=> 'Ref', default => sub {[]});
88 has
'traits' => (is
=> 'rw', isa
=> 'Ref');
95 sets the engine. Either sommer or lme4. Default: lme4.
99 has
'engine' => (is
=> 'rw', isa
=> 'Maybe[Str]', default => 'lme4' );
101 has
'levels' => (is
=> 'rw', isa
=> 'HashRef' );
103 has
'phenotype_file' => (is
=> 'rw', isa
=> 'CXGN::Phenotypes::File|Undef');
107 the tempfile that contains the phenotypic information.
111 has
'tempfile' => (is
=> 'rw', isa
=> 'Str|Undef');
118 if ($self->tempfile()) {
119 $phenotype_file = CXGN
::Phenotypes
::File
->new( { file
=> $self->tempfile() } );
121 $self->phenotype_file($phenotype_file);
125 =head2 generate_model()
127 generates the model string, in lme4 format, from the current parameters
134 my $tempfile = $self->tempfile();
135 my $dependent_variables = $self->dependent_variables();
136 my $fixed_factors = $self->fixed_factors();
137 my $fixed_factors_interaction = $self->fixed_factors_interaction();
138 my $variable_slope_intersects = $self->variable_slope_intersects();
139 my $random_factors = $self->random_factors();
145 print STDERR
join("\n", ("DV", Dumper
($dependent_variables), "FF", Dumper
($fixed_factors), "RF", Dumper
($random_factors), "TF", $tempfile, "FFI", Dumper
($fixed_factors_interaction), "VSI: ", Dumper
($variable_slope_intersects)));
147 print STDERR Dumper
($fixed_factors);
150 if (! $dependent_variables || scalar(@
$dependent_variables)==0) {
151 die "Need a dependent variable(s) set in CXGN::MixedModels... Ciao!";
154 my $formatted_fixed_factors = "";
155 if (@
$fixed_factors) {
156 $formatted_fixed_factors = join(" + ", @
$fixed_factors);
157 push @addends, $formatted_fixed_factors;
160 my $formatted_fixed_factors_interaction = "";
161 foreach my $interaction (@
$fixed_factors_interaction) {
163 if (ref($interaction)) {
164 $terms = join("*", @
$interaction);
165 push @addends, $terms;
168 $error = "Interaction terms are not correctly defined.";
172 my $formatted_variable_slope_intersects = "";
173 foreach my $variable_slope_groups (@
$variable_slope_intersects) {
174 if (exists($variable_slope_groups->[0]) && exists($variable_slope_groups->[1])) {
175 my $term = " (1+$variable_slope_groups->[0] \| $variable_slope_groups->[1]) ";
176 print STDERR
"TERM: $term\n";
177 $formatted_variable_slope_intersects .= $term;
178 push @addends, $formatted_variable_slope_intersects;
183 my $formatted_random_factors = "";
184 # if ($random_factors_random_slope) {
185 # $formatted_random_factors = " (1 + $random_factors->[0] | $random_factors->[1]) ";
189 foreach my $rf (@
$random_factors) {
191 $formatted_random_factors .= "(1|$rf)" ;
192 print STDERR
" formatted random factor now $formatted_random_factors\n";
193 push @addends, $formatted_random_factors;
198 $model .= join(" + ", @addends);
200 return ($model, $error);
203 sub generate_model_sommer
{
206 my $tempfile = $self->tempfile();
207 my @dependent_variables_cleaned = map { make_R_variable_name
($_) } @
{$self->dependent_variables()};
208 my $dependent_variables = \
@dependent_variables_cleaned;
209 my $fixed_factors = $self->fixed_factors();
210 my $fixed_factors_interaction = $self->fixed_factors_interaction();
211 my $random_factors_interaction = $self->random_factors_interaction();
212 my $variable_slope_intersects = $self->variable_slope_intersects();
213 my $random_factors = $self->random_factors();
215 print STDERR
"FIXED FACTORS FED TO GENERATE MODEL SOMMER: ".Dumper
($fixed_factors);
216 print STDERR
"FIXED Interaction FACTORS FED TO GENERATE MODEL SOMMER: ".Dumper
($fixed_factors_interaction);
217 print STDERR
"RANDOM Interaction FACTORS FED TO GENERATE MODEL SOMMER: ".Dumper
($random_factors_interaction);
221 ## generate the fixed factor formula
223 my $mmer_fixed_factors = "";
224 my $mmer_random_factors = "";
225 my $mmer_fixed_factors_interaction = "";
226 my $mmer_variable_slope_intersects ="";
228 if (scalar(@
$dependent_variables) > 1) { return ("", "For Sommer, only one trait can be analyzed at one time. Please go back and select only one trait or select lme4.") }
230 if (scalar(@
$dependent_variables) > 0) {
231 print STDERR
"preparing fixed factors...\n";
232 if (scalar(@
$fixed_factors) == 0) { $mmer_fixed_factors = "1"; }
233 else { $mmer_fixed_factors = join(" + ", @
$fixed_factors); }
235 print STDERR
"DEPENDENT VARIABLES: ".Dumper
($dependent_variables);
237 $mmer_fixed_factors = make_R_variable_name
($dependent_variables->[0]) ." ~ ". $mmer_fixed_factors;
239 if (scalar(@
$random_factors)== 0) {$mmer_random_factors = 1; }
242 print STDERR
"Preparing random factors...\n";
243 $mmer_random_factors = join("+", @
$random_factors);
246 if (scalar(@
$fixed_factors_interaction)== 0) {$mmer_fixed_factors_interaction = ""; }
250 foreach my $interaction(@
$fixed_factors_interaction){
253 if (scalar(@
$interaction) != 2) { $error = "interaction needs to be pairs :-(";}
256 else { $mmer_fixed_factors_interaction .= " + ". join(":", @
$interaction);}
262 # if (scalar(@$variable_slope_intersects)== 0) {$mmer_variable_slope_intersects = ""; }
266 # foreach my $intersects(@$variable_slope_intersects){
269 # if (scalar(@$intersects) != 2) { $error = "intersects needs to be pairs :-(";}
270 # #if (scalar(@$random_factors_interaction)== 1) { $error .= "Works only with one interaction for now! :-(";}
272 # else { $mmer_variable_slope_intersects .= " + vsr(". join(",", @$intersects) . ")";} # vsr(Days, Subject)
278 # $mmer_random_factors = " ~ ".$mmer_random_factors ." ".$mmer_fixed_factors_interaction." ".$mmer_variable_slope_intersects;
283 if (scalar(@
$variable_slope_intersects)== 0) {$mmer_variable_slope_intersects = ""; }
287 foreach my $intersects(@
$variable_slope_intersects){
290 if (scalar(@
$intersects) != 2) { $error = "intersects needs to be pairs :-(";}
291 #if (scalar(@$random_factors_interaction)== 1) { $error .= "Works only with one interaction for now! :-(";}
293 else { $mmer_variable_slope_intersects .= " + vsr(". join(",", @
$intersects) . ")";} # vsr(Days, Subject)
297 if ($mmer_random_factors){
298 $formula = " ~ ".$mmer_random_factors ;
300 if ($mmer_fixed_factors_interaction) {
301 $formula.=" ".$mmer_fixed_factors_interaction;
303 if ($mmer_variable_slope_intersects) {
304 $formula.=" ".$mmer_variable_slope_intersects;
310 print STDERR
"mmer_fixed_factors = $mmer_fixed_factors\n";
311 print STDERR
"mmer_random_factors = $formula\n";
313 #my $data = { fixed_factors => $mmer_fixed_factors,
314 # random_factors => $mmer_random_factors,
317 my $model = [ $mmer_fixed_factors, $formula ];
319 print STDERR
"Data returned from generate_model_sommer: ".Dumper
($model);
321 return ($model, $error);
332 runs the model along with the data provided in the phenotyping file.
334 Produces two results files. If germplasm was a random factor, is will produce files
335 with the tempfiles stem and the extensions .adjustedBLUPs and .BLUPs, if it was a
336 fixed factor, the extension are .adjustedBLUEs and .BLUEs. The files have the
337 accession identifiers in the first column, with the remaining columns being the
338 adjusted BLUEs, BLUEs, adjusted BLUPs or BLUPs for all the initially selected
341 The files from the dataset will contain trait names that are incompatible with R. These names are converted using the clean_file subroutine. The conversion takes place and is saved in a file with a .clean extension. Then, the .clean file is moved to the previous file name, such that there is not difference in file naming.
344 The result files will initially contain these R-based names as well. The conversion between R and dataset names are stored in a file with the extension .traits . The inital result files will be converted back to dataset names using the sub convert_file_headers_back_to_breedbase_traits() function. The conversion is saved in a file with the .original_traits extension, which is then moved back to the original file name once the conversion is complete.
352 my $backend = shift || 'Slurm';
353 my $cluster_host = shift || "localhost";
354 my $cluster_shared_tempdir = shift;
356 my $random_factors = '"'.join('","', @
{$self->random_factors()}).'"';
357 my $fixed_factors = '"'.join('","',@
{$self->fixed_factors()}).'"';
358 my $dependent_variables = '"'.join('","',@
{$self->dependent_variables()}).'"';
366 if ($self->engine() eq "lme4") {
367 ($model, $error) = $self->generate_model();
368 $executable = " R/mixed_models.R ";
371 elsif ($self->engine() eq "sommer") {
372 ($model, $error) = $self->generate_model_sommer();
373 $executable = " R/mixed_models_sommer.R ";
376 if ($error) { die "$error"; }
378 my $dependent_variables_R = make_R_variable_name
($dependent_variables);
380 # generate params_file
382 my $param_file = $self->tempfile().".params";
383 open(my $F, ">", $param_file) || die "Can't open $param_file for writing.";
384 print $F "dependent_variables <- c($dependent_variables_R)\n";
385 print $F "random_factors <- c($random_factors)\n";
386 print $F "fixed_factors <- c($fixed_factors)\n";
388 if ($self->engine() eq "lme4") {
389 print $F "model <- \"$model\"\n";
391 elsif ($self->engine() eq "sommer") {
392 print $F "fixed_model <- \"$model->[0]\"\n";
393 print $F "random_model <- \"$model->[1]\"\n";
397 # clean phenotype file so that trait names are R compatible
399 my $clean_tempfile = $self->clean_file($self->tempfile());
401 # run r script to create model
403 my $cmd = "R CMD BATCH '--args datafile=\"".$clean_tempfile."\" paramfile=\"".$self->tempfile().".params\"' $executable ". $self->tempfile().".out";
404 print STDERR
"running R command $cmd...\n";
406 print STDERR
"running R command $clean_tempfile...\n";
408 my $ctr = CXGN
::Tools
::Run
->new( { backend
=> $backend, working_dir
=> dirname
($self->tempfile()), submit_host
=> $cluster_host } );
411 $ctr->run_cluster($cmd);
413 while ($ctr->alive()) {
417 # replace the R-compatible traits with original trait names
419 print STDERR
"Converting files back to non-R headers...\n";
421 $self->tempfile().".adjustedBLUPs",
422 $self->tempfile().".BLUPs",
423 $self->tempfile().".BLUEs",
424 $self->tempfile().".adjustedBLUEs",
425 $self->tempfile().".anova",
426 $self->tempfile().".varcomp",
429 my $conversion_matrix = $self->read_conversion_matrix($self->tempfile().".traits");
432 $self->convert_file_headers_back_to_breedbase_traits($f, $conversion_matrix);
435 print STDERR
"File $f does not exist, not converting. This may be normal.\n";
447 =head2 make_R_variable_name
458 sub make_R_variable_name
{
475 open(my $PF, "<", $file) || die "Can't open pheno file ".$file."_phenotype.txt";
476 open(my $CLEAN, ">", $file.".clean") || die "Can't open ".$file.".clean for writing";
478 open(my $TRAITS, ">", $file.".traits") || die "Can't open ".$file.".traits for writing";
484 my @fields = split /\t/, $header;
486 my @file_traits = @fields[ 39 .. @fields-1 ];
487 my @other_headers = @fields[ 0 .. 38 ];
489 print STDERR
"FIELDS: ".Dumper
(\
@file_traits);
491 foreach my $t (@file_traits) {
492 my $R_t = make_R_variable_name
($t);
493 print $TRAITS "$R_t\t$t\n";
497 print STDERR
"FILE TRAITS: ".Dumper
(\
@file_traits);
499 my @new_header = (@other_headers, @file_traits);
500 print $CLEAN join("\t", @new_header)."\n";
507 print STDERR
"moving $file to $file.before_clean...\n";
508 move
($file, $file.".before_clean");
510 print STDERR
"moving $file.clean to $file...\n";
511 move
($file.".clean", $file);
517 sub convert_file_headers_back_to_breedbase_traits
{
520 my $conversion_matrix = shift;
522 open(my $F, "<", $file) || die "Can't open $file\n";
524 print STDERR
"Opening ".$self->tempfile().".original_traits for writing...\n";
525 open(my $G, ">", $file.".original_traits") || die "Can't open $file.original_traits";
530 my @fields = split /\t/, $header;
532 foreach my $f (@fields) {
533 if ($conversion_matrix->{$f}) {
534 print STDERR
"Converting $f to $conversion_matrix->{$f}...\n";
535 $f = $conversion_matrix->{$f};
540 print $G join("\t", @fields)."\n";
544 # replace NA or . with undef throughout the file
545 # (strings are not accepted by store phenotypes routine
546 # used in analysis storage).
548 my @fields = split /\t/;
549 foreach my $f (@fields) {
550 if ($f eq "NA" || $f eq '.') { $f = undef; }
552 my $line = join("\t", @fields);
557 print STDERR
"move file $file.original_traits back to $file...\n";
558 move
($file.".original_traits", $file);
561 sub read_conversion_matrix
{
565 my $conversion_file = $file;
567 open(my $F, "<", $conversion_file) || die "Can't open file $conversion_file";
569 my %conversion_matrix;
573 my ($new, $old) = split "\t";
574 $conversion_matrix{$new} = $old;
576 return \
%conversion_matrix;