modified validation for parents and error messages
[sgn.git] / lib / CXGN / Pedigree / ParseUpload / Plugin / CrossesExcelFormat.pm
blob09c752582b569baa2a3431231dfb42d740c5f694
1 package CXGN::Pedigree::ParseUpload::Plugin::CrossesExcelFormat;
3 use Moose::Role;
4 use Spreadsheet::ParseExcel;
5 use CXGN::Stock::StockLookup;
6 use SGN::Model::Cvterm;
7 use Data::Dumper;
8 use CXGN::List::Validate;
10 sub _validate_with_plugin {
11 my $self = shift;
12 my $filename = $self->get_filename();
13 my $schema = $self->get_chado_schema();
14 my $cross_properties = $self->get_cross_properties();
15 my @error_messages;
16 my %errors;
17 my %supported_cross_types;
18 my $parser = Spreadsheet::ParseExcel->new();
19 my $excel_obj;
20 my $worksheet;
22 #currently supported cross types
23 $supported_cross_types{'biparental'} = 1; #both parents required
24 $supported_cross_types{'self'} = 1; #only female parent required
25 $supported_cross_types{'open'} = 1; #only female parent required
26 $supported_cross_types{'bulk'} = 1; #both parents required
27 $supported_cross_types{'bulk_self'} = 1; #only female parent required
28 $supported_cross_types{'bulk_open'} = 1; #only female parent required
29 $supported_cross_types{'doubled_haploid'} = 1; #only female parent required
31 #try to open the excel file and report any errors
32 $excel_obj = $parser->parse($filename);
33 if ( !$excel_obj ) {
34 push @error_messages, $parser->error();
35 $errors{'error_messages'} = \@error_messages;
36 $self->_set_parse_errors(\%errors);
37 return;
40 $worksheet = ( $excel_obj->worksheets() )[0]; #support only one worksheet
41 if (!$worksheet) {
42 push @error_messages, "Spreadsheet must be on 1st tab in Excel (.xls) file";
43 $errors{'error_messages'} = \@error_messages;
44 $self->_set_parse_errors(\%errors);
45 return;
47 my ( $row_min, $row_max ) = $worksheet->row_range();
48 my ( $col_min, $col_max ) = $worksheet->col_range();
49 if (($col_max - $col_min) < 3 || ($row_max - $row_min) < 1 ) { #must have header and at least one row of crosses
50 push @error_messages, "Spreadsheet is missing header or contains no row";
51 $errors{'error_messages'} = \@error_messages;
52 $self->_set_parse_errors(\%errors);
53 return;
56 #get column headers
57 my $cross_name_header;
58 my $cross_combination_head;
59 my $cross_type_header;
60 my $female_parent_header;
61 my $male_parent_header;
62 my $female_plot_plant_header;
63 my $male_plot_plant_header;
65 if ($worksheet->get_cell(0,0)) {
66 $cross_name_header = $worksheet->get_cell(0,0)->value();
68 if ($worksheet->get_cell(0,1)) {
69 $cross_combination_head = $worksheet->get_cell(0,1)->value();
71 if ($worksheet->get_cell(0,2)) {
72 $cross_type_header = $worksheet->get_cell(0,2)->value();
74 if ($worksheet->get_cell(0,3)) {
75 $female_parent_header = $worksheet->get_cell(0,3)->value();
77 if ($worksheet->get_cell(0,4)) {
78 $male_parent_header = $worksheet->get_cell(0,4)->value();
80 if ($worksheet->get_cell(0,5)) {
81 $female_plot_plant_header = $worksheet->get_cell(0,5)->value();
83 if ($worksheet->get_cell(0,6)) {
84 $male_plot_plant_header = $worksheet->get_cell(0,6)->value();
87 if (!$cross_name_header || $cross_name_header ne 'cross_unique_id' ) {
88 push @error_messages, "Cell A1: cross_unique_id is missing from the header";
90 if (!$cross_type_header || $cross_type_header ne 'cross_type') {
91 push @error_messages, "Cell C1: cross_type is missing from the header";
93 if (!$female_parent_header || $female_parent_header ne 'female_parent') {
94 push @error_messages, "Cell D1: female_parent is missing from the header";
96 if (!$male_parent_header || $male_parent_header ne 'male_parent') {
97 push @error_messages, "Cell E1: male_parent is missing from the header";
99 if (!$female_plot_plant_header || (($female_plot_plant_header ne 'female_plot') && ($female_plot_plant_header ne 'female_plant'))) {
100 push @error_messages, "Cell F1: female_plot or female_plant is missing from the header";
102 if (!$male_plot_plant_header || (($male_plot_plant_header ne 'male_plot') && ($male_plot_plant_header ne 'male_plant'))) {
103 push @error_messages, "Cell G1: male_plot or male_plant is missing from the header";
106 my %valid_properties;
107 my @properties = @{$cross_properties};
108 foreach my $property(@properties){
109 $valid_properties{$property} = 1;
112 for my $column ( 7 .. $col_max ) {
113 my $header_string = $worksheet->get_cell(0,$column)->value();
115 if (!$valid_properties{$header_string}){
116 push @error_messages, "Invalid info type: $header_string";
120 my %seen_cross_names;
121 my %seen_accession_names;
122 my %seen_plot_plant_names;
124 for my $row ( 1 .. $row_max ) {
125 my $row_name = $row+1;
126 my $cross_name;
127 my $cross_combination;
128 my $cross_type;
129 my $female_parent;
130 my $male_parent;
131 my $female_plot_plant_name;
132 my $male_plot_plant_name;
134 if ($worksheet->get_cell($row,0)) {
135 $cross_name = $worksheet->get_cell($row,0)->value();
138 if ($worksheet->get_cell($row,1)) {
139 $cross_combination = $worksheet->get_cell($row,1)->value();
142 if ($worksheet->get_cell($row,2)) {
143 $cross_type = $worksheet->get_cell($row,2)->value();
146 if ($worksheet->get_cell($row,3)) {
147 $female_parent = $worksheet->get_cell($row,3)->value();
150 #skip blank lines or lines with no name, type and parent
151 if (!$cross_name && !$cross_type && !$female_parent) {
152 next;
155 if ($worksheet->get_cell($row,4)) {
156 $male_parent = $worksheet->get_cell($row,4)->value();
159 if ($worksheet->get_cell($row,5)) {
160 $female_plot_plant_name = $worksheet->get_cell($row,5)->value();
163 if ($worksheet->get_cell($row,6)) {
164 $male_plot_plant_name = $worksheet->get_cell($row,6)->value();
167 for my $column ( 7 .. $col_max ) {
168 if ($worksheet->get_cell($row,$column)) {
169 my $info_value = $worksheet->get_cell($row,$column)->value();
170 my $info_type = $worksheet->get_cell(0,$column)->value();
171 if ( ($info_type =~ m/days/ || $info_type =~ m/number/) && !($info_value =~ /^\d+?$/) ) {
172 push @error_messages, "Cell $info_type:$row_name: is not a positive integer: $info_value";
174 elsif ( $info_type =~ m/date/ && !($info_value =~ m/(\d{4})\/(\d{2})\/(\d{2})/) ) {
175 push @error_messages, "Cell $info_type:$row_name: is not a valid date: $info_value. Dates need to be of form YYYY/MM/DD";
180 #cross name must not be blank
181 if (!$cross_name || $cross_name eq '') {
182 push @error_messages, "Cell A$row_name: cross unique id missing";
183 } else {
184 $cross_name =~ s/^\s+|\s+$//g; #trim whitespace from front and end.
186 # } elsif ($cross_name =~ /\s/ || $cross_name =~ /\// || $cross_name =~ /\\/ ) {
187 # push @error_messages, "Cell A$row_name: cross_name must not contain spaces or slashes.";
188 if ($seen_cross_names{$cross_name}) {
189 push @error_messages, "Cell A$row_name: duplicate cross unique id: $cross_name";
192 #cross type must not be blank
193 if (!$cross_type || $cross_type eq '') {
194 push @error_messages, "Cell C$row_name: cross type missing";
195 } elsif (!$supported_cross_types{$cross_type}){
196 push @error_messages, "Cell C$row_name: cross type not supported: $cross_type";
199 #female parent must not be blank
200 if (!$female_parent || $female_parent eq '') {
201 push @error_messages, "Cell D$row_name: female parent missing";
204 #male parent must not be blank if type is biparental or bulk
205 if (!$male_parent || $male_parent eq '') {
206 if ($cross_type eq ( 'biparental' || 'bulk' )) {
207 push @error_messages, "Cell E$row_name: male parent required for biparental and bulk crosses";
211 if ($cross_name){
212 $seen_cross_names{$cross_name}++;
215 if ($female_parent){
216 $seen_accession_names{$female_parent}++;
219 if ($male_parent){
220 $seen_accession_names{$male_parent}++;
223 if ($female_plot_plant_name){
224 $seen_plot_plant_names{$female_plot_plant_name}++;
227 if ($male_plot_plant_name){
228 $seen_plot_plant_names{$male_plot_plant_name}++;
232 my @accessions = keys %seen_accession_names;
233 my $accession_validator = CXGN::List::Validate->new();
234 my @accessions_missing = @{$accession_validator->validate($schema,'uniquenames',\@accessions)->{'missing'}};
236 my $population_validator = CXGN::List::Validate->new();
237 my @parents_missing = @{$population_validator->validate($schema,'populations',\@accessions_missing)->{'missing'}};
239 if (scalar(@parents_missing) > 0) {
240 push @error_messages, "The following accessions or populations are not in the database or are not in the database as uniquenames: ".join(',',@parents_missing);
241 $errors{'missing_accessions'} = \@parents_missing;
244 if (($female_plot_plant_header eq 'female_plot') && ($male_plot_plant_header eq 'male_plot')) {
245 my @plots = keys %seen_plot_plant_names;
246 my $plot_validator = CXGN::List::Validate->new();
247 my @plots_missing = @{$plot_validator->validate($schema,'plots',\@plots)->{'missing'}};
249 if (scalar(@plots_missing) > 0) {
250 push @error_messages, "The following plots are not in the database or are not in the database as uniquenames: ".join(',',@plots_missing);
251 $errors{'missing_plots'} = \@plots_missing;
253 } elsif (($female_plot_plant_header eq 'female_plant') && ($male_plot_plant_header eq 'male_plant')) {
254 my @plants = keys %seen_plot_plant_names;
255 my $plant_validator = CXGN::List::Validate->new();
256 my @plants_missing = @{$plant_validator->validate($schema,'plants',\@plants)->{'missing'}};
258 if (scalar(@plants_missing) > 0) {
259 push @error_messages, "The following plants are not in the database as uniquenames or synonyms: ".join(',',@plants_missing);
260 $errors{'missing_plants'} = \@plants_missing;
264 my @crosses = keys %seen_cross_names;
265 my $rs = $schema->resultset("Stock::Stock")->search({
266 'is_obsolete' => { '!=' => 't' },
267 'uniquename' => { -in => \@crosses }
269 while (my $r=$rs->next){
270 push @error_messages, "Cross unique id already exists in database: ".$r->uniquename;
273 #store any errors found in the parsed file to parse_errors accessor
274 if (scalar(@error_messages) >= 1) {
275 $errors{'error_messages'} = \@error_messages;
276 $self->_set_parse_errors(\%errors);
277 return;
280 return 1; #returns true if validation is passed
284 sub _parse_with_plugin {
285 my $self = shift;
286 my $filename = $self->get_filename();
287 my $schema = $self->get_chado_schema();
288 my $parser = Spreadsheet::ParseExcel->new();
289 my $excel_obj;
290 my $worksheet;
291 my @pedigrees;
292 my %additional_properties;
293 my %properties_columns;
294 my %parsed_result;
296 $excel_obj = $parser->parse($filename);
297 if ( !$excel_obj ) {
298 return;
301 $worksheet = ( $excel_obj->worksheets() )[0];
302 my ( $row_min, $row_max ) = $worksheet->row_range();
303 my ( $col_min, $col_max ) = $worksheet->col_range();
305 my $female_plot_plant_header = $worksheet->get_cell(0,4)->value();
306 my $male_plot_plant_header = $worksheet->get_cell(0,5)->value();
308 for my $column ( 6 .. $col_max ) {
309 my $header_string = $worksheet->get_cell(0,$column)->value();
311 $properties_columns{$column} = $header_string;
312 $additional_properties{$header_string} = ();
315 for my $row ( 1 .. $row_max ) {
316 my $cross_name;
317 my $cross_combination;
318 my $cross_type;
319 my $female_parent;
320 my $male_parent;
321 my $female_plot;
322 my $male_plot;
323 my $female_plant;
324 my $male_plant;
325 my $cross_stock;
327 if ($worksheet->get_cell($row,0)) {
328 $cross_name = $worksheet->get_cell($row,0)->value();
329 $cross_name =~ s/^\s+|\s+$//g; #trim whitespace from front and end...
332 if ($worksheet->get_cell($row,1)) {
333 $cross_combination = $worksheet->get_cell($row,1)->value();
336 if ($worksheet->get_cell($row,2)) {
337 $cross_type = $worksheet->get_cell($row,2)->value();
340 if ($worksheet->get_cell($row,3)) {
341 $female_parent = $worksheet->get_cell($row,3)->value();
344 #skip blank lines or lines with no name, type and parent
345 if (!$cross_name && !$cross_type && !$female_parent) {
346 next;
348 if ($worksheet->get_cell($row,4)) {
349 $male_parent = $worksheet->get_cell($row,4)->value();
352 if ($worksheet->get_cell($row,5)) {
353 if ($female_plot_plant_header eq 'female_plot') {
354 $female_plot = $worksheet->get_cell($row,5)->value();
355 } elsif ($female_plot_plant_header eq 'female_plant') {
356 $female_plant = $worksheet->get_cell($row,5)->value();
360 if ($worksheet->get_cell($row,6)) {
361 if ($male_plot_plant_header eq 'male_plot') {
362 $male_plot = $worksheet->get_cell($row,6)->value();
363 } elsif ($male_plot_plant_header eq 'male_plant') {
364 $male_plant = $worksheet->get_cell($row,6)->value();
368 for my $column ( 7 .. $col_max ) {
369 if ($worksheet->get_cell($row,$column)) {
370 my $column_property = $properties_columns{$column};
371 $additional_properties{$column_property}{$cross_name} = $worksheet->get_cell($row,$column)->value();
372 my $info_type = $worksheet->get_cell(0,$column)->value();
373 $parsed_result{$info_type} = $additional_properties{$column_property};
377 my $pedigree = Bio::GeneticRelationships::Pedigree->new(name=>$cross_name, cross_combination=>$cross_combination, cross_type=>$cross_type);
378 if ($female_parent) {
379 my $female_parent_individual = Bio::GeneticRelationships::Individual->new(name => $female_parent);
380 $pedigree->set_female_parent($female_parent_individual);
382 if ($male_parent) {
383 my $male_parent_individual = Bio::GeneticRelationships::Individual->new(name => $male_parent);
384 $pedigree->set_male_parent($male_parent_individual);
386 if ($female_plot) {
387 my $female_plot_individual = Bio::GeneticRelationships::Individual->new(name => $female_plot);
388 $pedigree->set_female_plot($female_plot_individual);
390 if ($male_plot) {
391 my $male_plot_individual = Bio::GeneticRelationships::Individual->new(name => $male_plot);
392 $pedigree->set_male_plot($male_plot_individual);
394 if ($female_plant) {
395 my $female_plant_individual = Bio::GeneticRelationships::Individual->new(name => $female_plant);
396 $pedigree->set_female_plant($female_plant_individual);
398 if ($male_plant) {
399 my $male_plant_individual = Bio::GeneticRelationships::Individual->new(name => $male_plant);
400 $pedigree->set_male_plant($male_plant_individual);
403 push @pedigrees, $pedigree;
407 $parsed_result{'crosses'} = \@pedigrees;
409 $self->_set_parsed_data(\%parsed_result);
411 return 1;