start adding sommer support.
[sgn.git] / lib / CXGN / MixedModels.pm
blobb40ac32de41af7522d20e496b412ac6329285842
2 =head1 NAME
4 CXGN::MixedModels - a package to run user-specified mixed models
6 =head1 DESCRIPTION
8 my $mm = CXGN::MixedModels->new();
9 my $mm->phenotype_file("t/data/phenotype_data.csv");
10 my $mm->dependent_variables(qw | |);
11 my $mm->fixed_factors( qw | | );
12 my $mm->random_factors( qw| | );
13 my $mm->traits( qw| | );
15 =head1 AUTHOR
17 Lukas Mueller <lam87@cornell.edu>
19 =head1 METHODS
21 =cut
23 package CXGN::MixedModels;
25 use Moose;
26 use Data::Dumper;
27 use File::Slurp qw| slurp |;
28 use File::Basename;
29 use CXGN::Tools::Run;
30 use CXGN::Phenotypes::File;
32 =head2 dependent_variables()
34 sets the dependent variables (a listref of traits)
36 =cut
38 has 'dependent_variables' => (is => 'rw', isa => 'ArrayRef[Str]|Undef');
40 =head2 fixed_factors()
42 sets the fixed factors (listref)
44 =cut
46 has 'fixed_factors' => (is => 'rw', isa => 'Ref', default => sub {[]});
48 =head2 fixed_factors_interaction()
50 sets the fixed factors with interaction (listref)
52 =cut
54 has 'fixed_factors_interaction' => (is => 'rw', isa => 'Ref', default => sub{[]});
56 =head2 variable_slope_factors()
58 =cut
60 has 'variable_slope_factors' => (is => 'rw', isa => 'Ref', default => sub{[]});
62 =head2 random_factors()
64 =cut
66 has 'random_factors' => (is => 'rw', isa => 'Ref', default => sub {[]});
68 =head2 variable_slop_intersects
70 =cut
72 has 'variable_slope_intersects' => (is => 'rw', isa => 'Ref', default => sub {[]});
74 =head2 traits()
76 sets the traits
78 =cut
80 has 'traits' => (is => 'rw', isa => 'Ref');
82 =head2 levels()
86 =cut
88 has 'levels' => (is => 'rw', isa => 'HashRef' );
90 has 'phenotype_file' => (is => 'rw', isa => 'CXGN::Phenotypes::File|Undef');
92 =head2 tempfile()
94 the tempfile that contains the phenotypic information.
96 =cut
98 has 'tempfile' => (is => 'rw', isa => 'Str|Undef');
100 sub BUILD {
101 my $self = shift;
103 my $phenotype_file;
105 if ($self->tempfile()) {
106 $phenotype_file = CXGN::Phenotypes::File->new( { file => $self->tempfile() } );
108 $self->phenotype_file($phenotype_file);
112 =head2 generate_models()
114 generates the model string, in lme4 format, from the current parameters
116 =cut
118 sub generate_model {
119 my $self = shift;
121 my $tempfile = $self->tempfile();
122 my $dependent_variables = $self->dependent_variables();
123 my $fixed_factors = $self->fixed_factors();
124 my $fixed_factors_interaction = $self->fixed_factors_interaction();
125 my $variable_slope_intersects = $self->variable_slope_intersects();
126 my $random_factors = $self->random_factors();
128 my $error;
130 my @addends = ();
132 print STDERR join("\n", ("DV", Dumper($dependent_variables), "FF", Dumper($fixed_factors), "RF", Dumper($random_factors), "TF", $tempfile, "FFI", Dumper($fixed_factors_interaction), "VSI: ", Dumper($variable_slope_intersects)));
134 print STDERR Dumper($fixed_factors);
135 my $model = "";
137 if (! $dependent_variables || scalar(@$dependent_variables)==0) {
138 die "Need a dependent variable(s) set in CXGN::MixedModels... Ciao!";
141 my $formatted_fixed_factors = "";
142 if (@$fixed_factors) {
143 $formatted_fixed_factors = join(" + ", @$fixed_factors);
144 push @addends, $formatted_fixed_factors;
147 my $formatted_fixed_factors_interaction = "";
148 foreach my $interaction (@$fixed_factors_interaction) {
149 my $terms = "";
150 if (ref($interaction)) {
151 $terms = join("*", @$interaction);
152 push @addends, $terms;
154 else {
155 $error = "Interaction terms are not correctly defined.";
159 my $formatted_variable_slope_intersects = "";
160 foreach my $variable_slope_groups (@$variable_slope_intersects) {
161 if (exists($variable_slope_groups->[0]) && exists($variable_slope_groups->[1])) {
162 my $term = " (1+$variable_slope_groups->[0] \| $variable_slope_groups->[1]) ";
163 print STDERR "TERM: $term\n";
164 $formatted_variable_slope_intersects .= $term;
165 push @addends, $formatted_variable_slope_intersects;
170 my $formatted_random_factors = "";
171 # if ($random_factors_random_slope) {
172 # $formatted_random_factors = " (1 + $random_factors->[0] | $random_factors->[1]) ";
175 # else {
176 if (@$random_factors) {
177 $formatted_random_factors = join(" + ", map { "(1|$_)" } @$random_factors);
178 push @addends, $formatted_random_factors;
182 $model .= join(" + ", @addends);
184 return $model;
187 sub generate_model_sommer {
188 my $self = shift;
190 my $tempfile = $self->tempfile();
191 my $dependent_variables = $self->dependent_variables();
192 my $fixed_factors = $self->fixed_factors();
193 my $fixed_factors_interaction = $self->fixed_factors_interaction();
194 my $variable_slope_intersects = $self->variable_slope_intersects();
195 my $random_factors = $self->random_factors();
197 my $error;
199 ## generate the fixed factor formula
201 my $mmer_fixed_factors = "";
203 if (scalar(@$dependent_variables) > 1) { die "Works only with one trait for now! :-("; }
204 if (scalar(@$dependent_variables) > 0) {
205 if (scalar(@$fixed_factors) == 0) { $mmer_fixed_factors = "1"; }
206 else { $mmer_fixed_factors = join(" + ", @$fixed_factors); }
208 $mmer_fixed_factors = $dependent_variables->[0] ." ~ ". $mmer_fixed_factors;
211 print STDERR "mmer_fixed_factors = $mmer_fixed_factors\n";
213 return $mmer_fixed_factors;
217 =head2 run_model()
219 runs the model along with the data provided in the phenotyping file.
221 Produces two results files. If germplasm was a random factor, is will produce files
222 with the tempfiles stem and the extensions .adjustedBLUPs and .BLUPs, if it was a
223 fixed factor, the extension are .adjustedBLUEs and .BLUEs. The files have the
224 accession identifiers in the first column, with the remaining columns being the
225 adjusted BLUEs, BLUEs, adjusted BLUPs or BLUPs for all the initially selected
226 traits.
228 =cut
230 sub run_model {
231 my $self = shift;
232 my $backend = shift || 'Slurm';
233 my $cluster_host = shift || "localhost";
235 my $random_factors = '"'.join('","', @{$self->random_factors()}).'"';
236 my $fixed_factors = '"'.join('","',@{$self->fixed_factors()}).'"';
237 my $dependent_variables = '"'.join('","',@{$self->dependent_variables()}).'"';
239 my $model = $self->generate_model();
241 # generate params_file
243 my $param_file = $self->tempfile().".params";
244 open(my $F, ">", $param_file) || die "Can't open $param_file for writing.";
245 print $F "dependent_variables <- c($dependent_variables)\n";
246 print $F "random_factors <- c($random_factors)\n";
247 print $F "fixed_factors <- c($fixed_factors)\n";
249 print $F "model <- \"$model\"\n";
250 close($F);
252 # run r script to create model
254 my $cmd = "R CMD BATCH '--args datafile=\"".$self->tempfile()."\" paramfile=\"".$self->tempfile().".params\"' " . " R/mixed_models.R ".$self->tempfile().".out";
255 print STDERR "running R command $cmd...\n";
257 print STDERR "running R command ".$self->tempfile()."...\n";
259 my $ctr = CXGN::Tools::Run->new( { backend => $backend, working_dir => dirname($self->tempfile()), backend => $backend, submit_host => $cluster_host } );
261 $ctr->run_cluster($cmd);
263 while ($ctr->alive()) {
264 sleep(1);