parse trial upload with seedlot_name and num_seed_per_plot
[sgn.git] / lib / CXGN / Trial / ParseUpload / Plugin / TrialExcelFormat.pm
blob758c30a3b509cb21bd5a27058597abc14e4b0cd8
1 package CXGN::Trial::ParseUpload::Plugin::TrialExcelFormat;
3 use Moose::Role;
4 use Spreadsheet::ParseExcel;
5 use CXGN::Stock::StockLookup;
6 use SGN::Model::Cvterm;
7 use Data::Dumper;
8 use CXGN::List::Validate;
10 sub _validate_with_plugin {
11 print STDERR "Check 3.1.1 ".localtime();
12 my $self = shift;
13 my $filename = $self->get_filename();
14 my $schema = $self->get_chado_schema();
15 my %errors;
16 my @error_messages;
17 my %missing_accessions;
18 my %supported_trial_types;
19 my $parser = Spreadsheet::ParseExcel->new();
20 my $excel_obj;
21 my $worksheet;
22 my %seen_plot_names;
23 my %seen_accession_names;
24 my %seen_seedlot_names;
26 #currently supported trial types
27 $supported_trial_types{'biparental'} = 1; #both parents required
28 $supported_trial_types{'self'} = 1; #only female parent required
29 $supported_trial_types{'open'} = 1; #only female parent required
31 #try to open the excel file and report any errors
32 $excel_obj = $parser->parse($filename);
33 if ( !$excel_obj ) {
34 push @error_messages, $parser->error();
35 $errors{'error_messages'} = \@error_messages;
36 $self->_set_parse_errors(\%errors);
37 return;
40 print STDERR "Check 3.1.2 ".localtime();
42 $worksheet = ( $excel_obj->worksheets() )[0]; #support only one worksheet
43 if (!$worksheet) {
44 push @error_messages, "Spreadsheet must be on 1st tab in Excel (.xls) file";
45 $errors{'error_messages'} = \@error_messages;
46 $self->_set_parse_errors(\%errors);
47 return;
49 my ( $row_min, $row_max ) = $worksheet->row_range();
50 my ( $col_min, $col_max ) = $worksheet->col_range();
51 if (($col_max - $col_min) < 2 || ($row_max - $row_min) < 1 ) { #must have header and at least one row of plot data
52 push @error_messages, "Spreadsheet is missing header or contains no rows";
53 $errors{'error_messages'} = \@error_messages;
54 $self->_set_parse_errors(\%errors);
55 return;
58 #get column headers
59 my $plot_name_head;
60 my $accession_name_head;
61 my $seedlot_name_head;
62 my $num_seed_per_plot_head;
63 my $plot_number_head;
64 my $block_number_head;
65 my $is_a_control_head;
66 my $rep_number_head;
67 my $range_number_head;
68 my $row_number_head;
69 my $col_number_head;
71 if ($worksheet->get_cell(0,0)) {
72 $plot_name_head = $worksheet->get_cell(0,0)->value();
74 if ($worksheet->get_cell(0,1)) {
75 $accession_name_head = $worksheet->get_cell(0,1)->value();
77 if ($worksheet->get_cell(0,2)) {
78 $plot_number_head = $worksheet->get_cell(0,2)->value();
80 if ($worksheet->get_cell(0,3)) {
81 $block_number_head = $worksheet->get_cell(0,3)->value();
83 if ($worksheet->get_cell(0,4)) {
84 $is_a_control_head = $worksheet->get_cell(0,4)->value();
86 if ($worksheet->get_cell(0,5)) {
87 $rep_number_head = $worksheet->get_cell(0,5)->value();
89 if ($worksheet->get_cell(0,6)) {
90 $range_number_head = $worksheet->get_cell(0,6)->value();
92 if ($worksheet->get_cell(0,7)) {
93 $row_number_head = $worksheet->get_cell(0,7)->value();
95 if ($worksheet->get_cell(0,8)) {
96 $col_number_head = $worksheet->get_cell(0,8)->value();
98 if ($worksheet->get_cell(0,9)) {
99 $seedlot_name_head = $worksheet->get_cell(0,9)->value();
101 if ($worksheet->get_cell(0,10)) {
102 $num_seed_per_plot_head = $worksheet->get_cell(0,10)->value();
105 my @treatment_names;
106 for (11 .. $col_max){
107 if ($worksheet->get_cell(0,$_)){
108 push @treatment_names, $worksheet->get_cell(0,$_)->value();
112 if (!$plot_name_head || $plot_name_head ne 'plot_name' ) {
113 push @error_messages, "Cell A1: plot_name is missing from the header";
115 if (!$accession_name_head || $accession_name_head ne 'accession_name') {
116 push @error_messages, "Cell B1: accession_name is missing from the header";
118 if (!$plot_number_head || $plot_number_head ne 'plot_number') {
119 push @error_messages, "Cell C1: plot_number is missing from the header";
121 if (!$block_number_head || $block_number_head ne 'block_number') {
122 push @error_messages, "Cell D1: block_number is missing from the header";
124 if (!$is_a_control_head || $is_a_control_head ne 'is_a_control') {
125 push @error_messages, "Cell E1: is_a_control is missing from the header. (Header is required, but values are optional)";
127 if (!$rep_number_head || $rep_number_head ne 'rep_number') {
128 push @error_messages, "Cell F1: rep_number is missing from the header. (Header is required, but values are optional)";
130 if (!$range_number_head || $range_number_head ne 'range_number') {
131 push @error_messages, "Cell G1: range_number is missing from the header. (Header is required, but values are optional)";
133 if (!$row_number_head || $row_number_head ne 'row_number') {
134 push @error_messages, "Cell H1: row_number is missing from the header. (Header is required, but values are optional)";
136 if (!$col_number_head || $col_number_head ne 'col_number') {
137 push @error_messages, "Cell I1: col_number is missing from the header. (Header is required, but values are optional)";
139 if (!$seedlot_name_head || $seedlot_name_head ne 'seedlot_name') {
140 push @error_messages, "Cell J1: seedlot_name is missing from the header. (Header is required, but values are optional)";
142 if (!$num_seed_per_plot_head || $num_seed_per_plot_head ne 'num_seed_per_plot') {
143 push @error_messages, "Cell K1: num_seed_per_plot is missing from the header. (Header is required, but values are optional)";
146 for my $row ( 1 .. $row_max ) {
147 #print STDERR "Check 01 ".localtime();
148 my $row_name = $row+1;
149 my $plot_name;
150 my $accession_name;
151 my $seedlot_name;
152 my $num_seed_per_plot = 0;
153 my $plot_number;
154 my $block_number;
155 my $is_a_control;
156 my $rep_number;
157 my $range_number;
158 my $row_number;
159 my $col_number;
161 if ($worksheet->get_cell($row,0)) {
162 $plot_name = $worksheet->get_cell($row,0)->value();
164 if ($worksheet->get_cell($row,1)) {
165 $accession_name = $worksheet->get_cell($row,1)->value();
167 if ($worksheet->get_cell($row,2)) {
168 $plot_number = $worksheet->get_cell($row,2)->value();
170 if ($worksheet->get_cell($row,3)) {
171 $block_number = $worksheet->get_cell($row,3)->value();
173 if ($worksheet->get_cell($row,4)) {
174 $is_a_control = $worksheet->get_cell($row,4)->value();
176 if ($worksheet->get_cell($row,5)) {
177 $rep_number = $worksheet->get_cell($row,5)->value();
179 if ($worksheet->get_cell($row,6)) {
180 $range_number = $worksheet->get_cell($row,6)->value();
182 if ($worksheet->get_cell($row, 7)) {
183 $row_number = $worksheet->get_cell($row, 7)->value();
185 if ($worksheet->get_cell($row, 8)) {
186 $col_number = $worksheet->get_cell($row, 8)->value();
188 if ($worksheet->get_cell($row,9)) {
189 $seedlot_name = $worksheet->get_cell($row,9)->value();
191 if ($worksheet->get_cell($row,10)) {
192 $num_seed_per_plot = $worksheet->get_cell($row,10)->value();
195 #skip blank lines
196 if (!$plot_name && !$accession_name && !$plot_number && !$block_number) {
197 next;
200 #print STDERR "Check 02 ".localtime();
202 #plot_name must not be blank
203 if (!$plot_name || $plot_name eq '' ) {
204 push @error_messages, "Cell A$row_name: plot name missing.";
206 elsif ($plot_name =~ /\s/ || $plot_name =~ /\// || $plot_name =~ /\\/ ) {
207 push @error_messages, "Cell A$row_name: plot name must not contain spaces or slashes.";
209 else {
210 #file must not contain duplicate plot names
211 if ($seen_plot_names{$plot_name}) {
212 push @error_messages, "Cell A$row_name: duplicate plot name at cell A".$seen_plot_names{$plot_name}.": $plot_name";
214 $seen_plot_names{$plot_name}=$row_name;
217 #print STDERR "Check 03 ".localtime();
219 #accession name must not be blank
220 if (!$accession_name || $accession_name eq '') {
221 push @error_messages, "Cell B$row_name: accession name missing";
222 } else {
223 #accession name must exist in the database
224 $seen_accession_names{$accession_name}++;
227 #print STDERR "Check 04 ".localtime();
229 #plot number must not be blank
230 if (!$plot_number || $plot_number eq '') {
231 push @error_messages, "Cell C$row_name: plot number missing";
233 #plot number must be a positive integer
234 if (!($plot_number =~ /^\d+?$/)) {
235 push @error_messages, "Cell C$row_name: plot number is not a positive integer: $plot_number";
237 #block number must not be blank
238 if (!$block_number || $block_number eq '') {
239 push @error_messages, "Cell D$row_name: block number missing";
241 #block number must be a positive integer
242 if (!($block_number =~ /^\d+?$/)) {
243 push @error_messages, "Cell D$row_name: block number is not a positive integer: $block_number";
245 if ($is_a_control) {
246 #is_a_control must be either yes, no 1, 0, or blank
247 if (!($is_a_control eq "yes" || $is_a_control eq "no" || $is_a_control eq "1" ||$is_a_control eq "0" || $is_a_control eq '')) {
248 push @error_messages, "Cell E$row_name: is_a_control is not either yes, no 1, 0, or blank: $is_a_control";
251 if ($rep_number && !($rep_number =~ /^\d+?$/)){
252 push @error_messages, "Cell F$row_name: rep_number must be a positive integer: $rep_number";
254 if ($range_number && !($range_number =~ /^\d+?$/)){
255 push @error_messages, "Cell G$row_name: range_number must be a positive integer: $range_number";
257 if ($row_number && !($row_number =~ /^\d+?$/)){
258 push @error_messages, "Cell H$row_name: row_number must be a positive integer: $row_number";
260 if ($col_number && !($col_number =~ /^\d+?$/)){
261 push @error_messages, "Cell I$row_name: col_number must be a positive integer: $col_number";
264 if ($seedlot_name){
265 $seen_seedlot_names{$seedlot_name}++;
267 if (defined($num_seed_per_plot) && !($num_seed_per_plot =~ /^\d+?$/)){
268 push @error_messages, "Cell K$row_name: num_seed_per_plot must be a positive integer: $num_seed_per_plot";
271 my $treatment_col = 10;
272 foreach my $treatment_name (@treatment_names){
273 if($worksheet->get_cell($row,$treatment_col)){
274 my $apply_treatment = $worksheet->get_cell($row,$treatment_col);
275 if (defined($apply_treatment) && $apply_treatment ne '1'){
276 push @error_messages, "Treatment value in row $row_name should be either 1 or empty";
279 $treatment_col++;
284 my @accessions = keys %seen_accession_names;
285 my $accession_validator = CXGN::List::Validate->new();
286 my @accessions_missing = @{$accession_validator->validate($schema,'accessions',\@accessions)->{'missing'}};
288 if (scalar(@accessions_missing) > 0) {
289 $errors{'missing_accessions'} = \@accessions_missing;
290 push @error_messages, "The following accessions are not in the database as uniquenames or synonyms: ".join(',',@accessions_missing);
293 my @seedlot_names = keys %seen_seedlot_names;
294 if (scalar(@seedlot_names)>0){
295 my $seedlot_validator = CXGN::List::Validate->new();
296 my @seedlots_missing = @{$seedlot_validator->validate($schema,'seedlots',\@seedlot_names)->{'missing'}};
298 if (scalar(@seedlots_missing) > 0) {
299 $errors{'missing_seedlots'} = \@seedlots_missing;
300 push @error_messages, "The following seedlots are not in the database: ".join(',',@seedlots_missing);
304 my @plots = keys %seen_plot_names;
305 my $rs = $schema->resultset("Stock::Stock")->search({
306 'is_obsolete' => { '!=' => 't' },
307 'uniquename' => { -in => \@plots }
309 while (my $r=$rs->next){
310 push @error_messages, "Cell A".$seen_plot_names{$r->uniquename}.": plot name already exists: ".$r->uniquename;
313 #store any errors found in the parsed file to parse_errors accessor
314 if (scalar(@error_messages) >= 1) {
315 $errors{'error_messages'} = \@error_messages;
316 $self->_set_parse_errors(\%errors);
317 return;
320 print STDERR "Check 3.1.3 ".localtime();
322 return 1; #returns true if validation is passed
327 sub _parse_with_plugin {
328 my $self = shift;
329 my $filename = $self->get_filename();
330 my $schema = $self->get_chado_schema();
331 my $parser = Spreadsheet::ParseExcel->new();
332 my $excel_obj;
333 my $worksheet;
334 my %design;
336 $excel_obj = $parser->parse($filename);
337 if ( !$excel_obj ) {
338 return;
341 $worksheet = ( $excel_obj->worksheets() )[0];
342 my ( $row_min, $row_max ) = $worksheet->row_range();
343 my ( $col_min, $col_max ) = $worksheet->col_range();
345 my @treatment_names;
346 for (10 .. $col_max){
347 if ($worksheet->get_cell(0,$_)){
348 push @treatment_names, $worksheet->get_cell(0,$_)->value();
352 for my $row ( 1 .. $row_max ) {
353 my $plot_name;
354 my $accession_name;
355 my $plot_number;
356 my $block_number;
357 my $is_a_control;
358 my $rep_number;
359 my $range_number;
360 my $row_number;
361 my $col_number;
362 my $seedlot_name;
363 my $num_seed_per_plot = 0;
365 if ($worksheet->get_cell($row,0)) {
366 $plot_name = $worksheet->get_cell($row,0)->value();
368 if ($worksheet->get_cell($row,1)) {
369 $accession_name = $worksheet->get_cell($row,1)->value();
371 if ($worksheet->get_cell($row,2)) {
372 $plot_number = $worksheet->get_cell($row,2)->value();
374 if ($worksheet->get_cell($row,3)) {
375 $block_number = $worksheet->get_cell($row,3)->value();
377 if ($worksheet->get_cell($row,4)) {
378 $is_a_control = $worksheet->get_cell($row,4)->value();
380 if ($worksheet->get_cell($row,5)) {
381 $rep_number = $worksheet->get_cell($row,5)->value();
383 if ($worksheet->get_cell($row,6)) {
384 $range_number = $worksheet->get_cell($row,6)->value();
386 if ($worksheet->get_cell($row,7)) {
387 $row_number = $worksheet->get_cell($row, 7)->value();
389 if ($worksheet->get_cell($row,8)) {
390 $col_number = $worksheet->get_cell($row, 8)->value();
392 if ($worksheet->get_cell($row,9)) {
393 $seedlot_name = $worksheet->get_cell($row, 9)->value();
395 if ($worksheet->get_cell($row,10)) {
396 $num_seed_per_plot = $worksheet->get_cell($row, 10)->value();
399 #skip blank lines
400 if (!$plot_name && !$accession_name && !$plot_number && !$block_number) {
401 next;
404 my $treatment_col = 10;
405 foreach my $treatment_name (@treatment_names){
406 if($worksheet->get_cell($row,$treatment_col)){
407 push @{$design{treatments}->{$treatment_name}}, $plot_name;
409 $treatment_col++;
412 my $key = $row;
413 $design{$key}->{plot_name} = $plot_name;
414 $design{$key}->{stock_name} = $accession_name;
415 $design{$key}->{plot_number} = $plot_number;
416 $design{$key}->{block_number} = $block_number;
417 if ($is_a_control) {
418 $design{$key}->{is_a_control} = 1;
419 } else {
420 $design{$key}->{is_a_control} = 0;
422 if ($rep_number) {
423 $design{$key}->{rep_number} = $rep_number;
425 if ($range_number) {
426 $design{$key}->{range_number} = $range_number;
428 if ($row_number) {
429 $design{$key}->{row_number} = $row_number;
431 if ($col_number) {
432 $design{$key}->{col_number} = $col_number;
434 if ($seedlot_name){
435 $design{$key}->{seedlot_name} = $seedlot_name;
436 $design{$key}->{num_seed_per_plot} = $num_seed_per_plot;
440 #print STDERR Dumper \%design;
441 $self->_set_parsed_data(\%design);
443 return 1;