modified key
[sgn.git] / lib / CXGN / Dataset / File.pm
blob3a6e5b6524d6082cd2c4f82ac575e74059b5edf7
2 package CXGN::Dataset::File;
4 use Moose;
5 use File::Slurp qw | write_file |;
6 use JSON::Any;
7 use Data::Dumper;
8 use CXGN::Genotype::Search;
10 extends 'CXGN::Dataset';
12 has 'file_name' => ( isa => 'Str',
13 is => 'rw',
14 default => '/tmp/dataset_file',
17 has 'quotes' => ( isa => 'Bool',
18 is => 'rw',
19 default => 1,
22 sub retrieve_genotypes_vcf {
23 my $self = shift;
24 my $protocol_id = shift;
25 my $file = shift; # || $self->file_name()."_genotype.txt";
26 my $cache_root_dir = shift;
27 my $cluster_shared_tempdir_config = shift;
28 my $backend_config = shift;
29 my $cluster_host_config = shift;
30 my $web_cluster_queue_config = shift;
31 my $basepath_config = shift;
32 my $forbid_cache = shift;
33 my $genotypeprop_hash_select = shift || ['DS'];
34 my $protocolprop_top_key_select = shift || [];
35 my $protocolprop_marker_hash_select = shift || [];
36 my $return_only_first_genotypeprop_for_stock = shift || 1;
38 my $accessions_list_ref = $self->retrieve_accessions();
40 my @accession_ids;
41 foreach my $ai (@$accessions_list_ref) {
42 push @accession_ids, $ai->[0];
45 my @protocol_ids;
47 if ($protocol_id) {
48 push @protocol_ids, $protocol_id;
50 else {
51 my $genotyping_protocol_ref = $self->retrieve_genotyping_protocols();
53 foreach my $p (@$genotyping_protocol_ref) {
54 push @protocol_ids, $p->[0];
58 if (! @protocol_ids) {
59 die "No protocol provided or no protocol associated with dataset\n";
62 print STDERR "PROTOCOL IDS: ".Dumper(\@protocol_ids);
64 my @accessions_list = @$accessions_list_ref;
65 my $genotypes_search = CXGN::Genotype::Search->new(
66 bcs_schema => $self->schema(),
67 people_schema => $self->people_schema(),
68 cache_root=>$cache_root_dir,
69 accession_list => \@accession_ids,
70 trial_list => $self->trials(),
71 protocol_id_list => \@protocol_ids,
72 genotypeprop_hash_select=>$genotypeprop_hash_select, #THESE ARE THE KEYS IN THE GENOTYPEPROP OBJECT
73 protocolprop_top_key_select=>$protocolprop_top_key_select, #THESE ARE THE KEYS AT THE TOP LEVEL OF THE PROTOCOLPROP OBJECT
74 protocolprop_marker_hash_select=>$protocolprop_marker_hash_select, #THESE ARE THE KEYS IN THE MARKERS OBJECT IN THE PROTOCOLPROP OBJECT
75 return_only_first_genotypeprop_for_stock=>$return_only_first_genotypeprop_for_stock, #FOR MEMORY REASONS TO LIMIT DATA
76 forbid_cache=>$forbid_cache
78 my @required_config = (
79 $cluster_shared_tempdir_config,
80 $backend_config,
81 $cluster_host_config,
82 $web_cluster_queue_config,
83 $basepath_config
86 my $filehandle = $genotypes_search->get_cached_file_VCF(@required_config);
87 print STDERR "Checking if a file was requested...\n";
89 if ($file) {
90 print STDERR "Generating the file $file ...\n";
91 open(my $F, ">", $file) || die "Can't open file $file";
92 while(<$filehandle>) {
93 print $F $_;
95 seek $filehandle, 0,0; # rewind filehandle to the beginning of the file
96 print STDERR "Done.\n";
97 close($F);
100 return $filehandle
104 override('retrieve_genotypes',
105 sub {
106 my $self = shift;
107 my $protocol_id = shift;
108 my $file = shift || $self->file_name()."_genotype.txt";
109 my $cache_root_dir = shift;
110 my $cluster_shared_tempdir_config = shift;
111 my $backend_config = shift;
112 my $cluster_host_config = shift;
113 my $web_cluster_queue_config = shift;
114 my $basepath_config = shift;
115 my $forbid_cache = shift;
118 my $genotypeprop_hash_select = shift || ['DS'];
119 my $protocolprop_top_key_select = shift || [];
120 my $protocolprop_marker_hash_select = shift || [];
121 my $return_only_first_genotypeprop_for_stock = shift || 1;
123 my $accessions_list_ref = $self->retrieve_accessions();
124 my @accession_ids;
125 foreach (@$accessions_list_ref) {
126 push @accession_ids, $_->[0];
129 my $genotyping_protocol_ref = $self->retrieve_genotyping_protocols();
130 my @protocols;
131 foreach my $p (@$genotyping_protocol_ref) {
132 push @protocols, $p->[0];
135 print STDERR "PROTOCOLS IN RETRIEVE_GENOTYPES: ".join(", ",@protocols)."\n";
136 my @accessions_list = @$accessions_list_ref;
137 my $genotypes_search = CXGN::Genotype::Search->new(
138 bcs_schema => $self->schema(),
139 people_schema => $self->people_schema(),
140 cache_root=>$cache_root_dir,
141 accession_list => \@accession_ids,
142 trial_list => $self->trials(),
143 protocol_id_list => \@protocols,
144 genotypeprop_hash_select=>$genotypeprop_hash_select, #THESE ARE THE KEYS IN THE GENOTYPEPROP OBJECT
145 protocolprop_top_key_select=>$protocolprop_top_key_select, #THESE ARE THE KEYS AT THE TOP LEVEL OF THE PROTOCOLPROP OBJECT
146 protocolprop_marker_hash_select=>$protocolprop_marker_hash_select, #THESE ARE THE KEYS IN THE MARKERS OBJECT IN THE PROTOCOLPROP OBJECT
147 return_only_first_genotypeprop_for_stock=>$return_only_first_genotypeprop_for_stock, #FOR MEMORY REASONS TO LIMIT DATA
148 forbid_cache=>$forbid_cache
151 print STDERR "DONE WITH GENO SEARCH!\n";
153 my @required_config = (
154 $cluster_shared_tempdir_config,
155 $backend_config,
156 $cluster_host_config,
157 $web_cluster_queue_config,
158 $basepath_config
161 my $fh = $genotypes_search->get_cached_file_dosage_matrix(@required_config);
163 print STDERR "DONE GETTING DOSAGE MATRIX\n";
166 if ($file) {
167 print STDERR "Generating the file $file ...\n";
168 open(my $F, ">", $file) || die "Can't open file $file";
169 while(<$fh>) {
170 print $F $_;
172 print STDERR "Done.\n";
173 close($F);
174 seek $fh, 0, 0; # reset the filehandle
177 return $fh;
183 override('retrieve_phenotypes',
184 sub {
185 my $self = shift;
186 my $file = shift || $self->file_name()."_phenotype.txt";
187 my $phenotypes = $self->SUPER::retrieve_phenotypes();
188 my $phenotype_string = "";
189 my $s;
190 foreach my $line (@$phenotypes) {
191 if ($self->quotes()) {
192 no warnings; # turn off warnings, otherwise there are a lot of undefined warnings.
193 $s = join("\t", map { "\"$_\"" } @$line);
195 else {
196 no warnings;
197 $s = join("\t", @$line);
199 # $s = "";
200 # my $num_col = scalar(@{$line});
201 # for (my $j = 0; $j < $num_col; $j++) {
202 # if (@$line[$j]) {
203 # if ($s eq "") {
204 # $s .= "\"@$line[$j]\"";
205 # } else {
206 # $s .= "\t\"@$line[$j]\"";
208 # } else {
209 # $s .= "\t";
211 # }
213 $s =~ s/\n//g;
214 $s =~ s/\r//g;
215 $phenotype_string .= $s."\n";
217 write_file($file, $phenotype_string);
218 return $phenotypes;
221 override('retrieve_accessions',
222 sub {
223 my $self = shift;
224 my $file = shift || $self->file_name()."_accessions.txt";
225 my $accessions = $self->SUPER::retrieve_accessions();
226 my $accession_json = JSON::Any->encode($accessions);
227 write_file($file, $accession_json);
228 return $accessions;
231 override('retrieve_plots',
232 sub {
233 my $self = shift;
234 my $file = shift || $self->file_name()."_plots.txt";
235 my $plots = $self->SUPER::retrieve_plots();
236 my $plot_json = JSON::Any->encode($plots);
237 write_file($file, $plot_json);
238 return $plots;
241 override('retrieve_trials',
242 sub {
243 my $self = shift;
244 my $file = shift || $self->file_name()."_trials.txt";
245 my $trials = $self->SUPER::retrieve_trials();
246 my $trial_json = JSON::Any->encode($trials);
247 write_file($file, $trial_json);
248 return $trials;
251 override('retrieve_traits',
252 sub {
253 my $self = shift;
254 my $file = shift || $self->file_name()."_traits.txt";
255 my $traits = $self->SUPER::retrieve_traits();
256 my $trait_json = JSON::Any->encode($traits);
257 write_file($file, $trait_json);
258 return $traits;
261 override('retrieve_years',
262 sub {
263 my $self = shift;
264 my $file = shift || $self->file_name()."_years.txt";
265 my $years = $self->SUPER::retrieve_years();
266 my $year_json = JSON::Any->encode($years);
267 write_file($file, $year_json);
268 return $years;