Merge pull request #5205 from solgenomics/topic/generic_trial_upload
[sgn.git] / lib / CXGN / Pedigree / ParseUpload / Plugin / CrossesSimpleExcel.pm
blobdb19559f0e0c0f1f05d6b0f2dbf95e30f415a76b
1 package CXGN::Pedigree::ParseUpload::Plugin::CrossesSimpleExcel;
3 use Moose::Role;
4 use Spreadsheet::ParseExcel;
5 use Spreadsheet::ParseXLSX;
6 use CXGN::Stock::StockLookup;
7 use SGN::Model::Cvterm;
8 use Data::Dumper;
9 use CXGN::List::Validate;
11 sub _validate_with_plugin {
12 my $self = shift;
13 my $filename = $self->get_filename();
14 my $schema = $self->get_chado_schema();
15 my $cross_additional_info = $self->get_cross_additional_info();
16 my @error_messages;
17 my %errors;
18 my %supported_cross_types;
20 # Match a dot, extension .xls / .xlsx
21 my ($extension) = $filename =~ /(\.[^.]+)$/;
22 my $parser;
24 if ($extension eq '.xlsx') {
25 $parser = Spreadsheet::ParseXLSX->new();
27 else {
28 $parser = Spreadsheet::ParseExcel->new();
31 my $excel_obj;
32 my $worksheet;
33 # print STDERR "ADDITIONAL INFO =".Dumper($cross_additional_info)."\n";
35 #currently supported cross types
36 $supported_cross_types{'biparental'} = 1; #both parents required
37 $supported_cross_types{'self'} = 1; #only female parent required
38 $supported_cross_types{'open'} = 1; #only female parent required
39 $supported_cross_types{'sib'} = 1; #both parents required but can be the same.
40 $supported_cross_types{'bulk_self'} = 1; #only female population required
41 $supported_cross_types{'bulk_open'} = 1; #only female population required
42 $supported_cross_types{'bulk'} = 1; #both female population and male accession required
43 $supported_cross_types{'doubled_haploid'} = 1; #only female parent required
44 $supported_cross_types{'dihaploid_induction'} = 1; # only female parent required
45 $supported_cross_types{'polycross'} = 1; #both parents required
46 $supported_cross_types{'backcross'} = 1; #both parents required, parents can be cross or accession stock type
48 #try to open the excel file and report any errors
49 $excel_obj = $parser->parse($filename);
50 if ( !$excel_obj ) {
51 push @error_messages, $parser->error();
52 $errors{'error_messages'} = \@error_messages;
53 $self->_set_parse_errors(\%errors);
54 return;
57 $worksheet = ( $excel_obj->worksheets() )[0]; #support only one worksheet
58 if (!$worksheet) {
59 push @error_messages, "Spreadsheet must be on 1st tab in Excel (.xls) file";
60 $errors{'error_messages'} = \@error_messages;
61 $self->_set_parse_errors(\%errors);
62 return;
64 my ( $row_min, $row_max ) = $worksheet->row_range();
65 my ( $col_min, $col_max ) = $worksheet->col_range();
66 if (($col_max - $col_min) < 4 || ($row_max - $row_min) < 1 ) { #must have header and at least one row of crosses
67 push @error_messages, "Spreadsheet is missing header or contains no row";
68 $errors{'error_messages'} = \@error_messages;
69 $self->_set_parse_errors(\%errors);
70 return;
73 #get column headers
74 my $cross_name_head;
75 my $cross_combination_head;
76 my $cross_type_head;
77 my $female_parent_head;
78 my $male_parent_head;
80 if ($worksheet->get_cell(0,0)) {
81 $cross_name_head = $worksheet->get_cell(0,0)->value();
82 $cross_name_head =~ s/^\s+|\s+$//g;
84 if ($worksheet->get_cell(0,1)) {
85 $cross_combination_head = $worksheet->get_cell(0,1)->value();
86 $cross_combination_head =~ s/^\s+|\s+$//g;
88 if ($worksheet->get_cell(0,2)) {
89 $cross_type_head = $worksheet->get_cell(0,2)->value();
90 $cross_type_head =~ s/^\s+|\s+$//g;
92 if ($worksheet->get_cell(0,3)) {
93 $female_parent_head = $worksheet->get_cell(0,3)->value();
94 $female_parent_head =~ s/^\s+|\s+$//g;
96 if ($worksheet->get_cell(0,4)) {
97 $male_parent_head = $worksheet->get_cell(0,4)->value();
98 $male_parent_head =~ s/^\s+|\s+$//g;
102 if (!$cross_name_head || $cross_name_head ne 'cross_unique_id' ) {
103 push @error_messages, "Cell A1: cross_unique_id is missing from the header";
105 if (!$cross_combination_head || $cross_combination_head ne 'cross_combination') {
106 push @error_messages, "Cell B1: cross_combination is missing from the header";
108 if (!$cross_type_head || $cross_type_head ne 'cross_type') {
109 push @error_messages, "Cell C1: cross_type is missing from the header";
111 if (!$female_parent_head || $female_parent_head ne 'female_parent') {
112 push @error_messages, "Cell D1: female_parent is missing from the header";
114 if (!$male_parent_head || $male_parent_head ne 'male_parent') {
115 push @error_messages, "Cell E1: male_parent is missing from the header";
118 my %valid_additional_info;
119 my @valid_info = @{$cross_additional_info};
120 foreach my $info(@valid_info){
121 $valid_additional_info{$info} = 1;
124 for my $column (5 .. $col_max){
125 if ($worksheet->get_cell(0, $column)) {
126 my $header_string = $worksheet->get_cell(0,$column)->value();
127 $header_string =~ s/^\s+|\s+$//g;
129 if (($header_string) && (!$valid_additional_info{$header_string})){
130 push @error_messages, "Invalid info type: $header_string";
135 my %seen_cross_names;
136 my %seen_accession_names;
137 my %seen_backcross_parents;
138 my %seen_population_names;
140 for my $row ( 1 .. $row_max ) {
141 my $row_name = $row+1;
142 my $cross_name;
143 my $cross_combination;
144 my $cross_type;
145 my $female_parent;
146 my $male_parent;
148 if ($worksheet->get_cell($row,0)) {
149 $cross_name = $worksheet->get_cell($row,0)->value();
151 if ($worksheet->get_cell($row,1)) {
152 $cross_combination = $worksheet->get_cell($row,1)->value();
153 $cross_combination =~ s/^\s+|\s+$//g;
155 if ($worksheet->get_cell($row,2)) {
156 $cross_type = $worksheet->get_cell($row,2)->value();
157 $cross_type =~ s/^\s+|\s+$//g;
159 if ($worksheet->get_cell($row,3)) {
160 $female_parent = $worksheet->get_cell($row,3)->value();
163 if (!defined $cross_name && !defined $cross_type && !defined $female_parent) {
164 last;
167 if ($worksheet->get_cell($row,4)) {
168 $male_parent = $worksheet->get_cell($row,4)->value();
171 $female_parent =~ s/^\s+|\s+$//g;
172 $male_parent =~ s/^\s+|\s+$//g;
174 #cross name must not be blank
175 if (!$cross_name || $cross_name eq '') {
176 push @error_messages, "Cell A$row_name: cross unique id missing";
177 } else {
178 $cross_name =~ s/^\s+|\s+$//g; #trim whitespace from front and end.
180 # } elsif ($cross_name =~ /\s/ || $cross_name =~ /\// || $cross_name =~ /\\/ ) {
181 # push @error_messages, "Cell A$row_name: cross_name must not contain spaces or slashes.";
182 if ($seen_cross_names{$cross_name}) {
183 push @error_messages, "Cell A$row_name: duplicate cross unique id: $cross_name";
186 if (($cross_type eq 'double_haploid') || ($cross_type eq 'dihaploid_induction') || ($cross_type eq 'self')) {
187 if ($female_parent ne $male_parent) {
188 push @error_messages, "For double haploid, dihaploid_induction, and self, female parent needs to be identical to male parent in row $row_name";
194 #cross type must not be blank
195 if (!$cross_type || $cross_type eq '') {
196 push @error_messages, "Cell C$row_name: cross type missing";
197 } elsif (!$supported_cross_types{$cross_type}){
198 push @error_messages, "Cell C$row_name: cross type not supported: $cross_type";
201 #female parent must not be blank
202 if (!$female_parent || $female_parent eq '') {
203 push @error_messages, "Cell D$row_name: female parent missing";
206 #male parent must not be blank if type is biparental, sib, polycross or bulk
207 if (!$male_parent || $male_parent eq '') {
208 if ($cross_type eq ( 'biparental' || 'bulk' || 'sib' || 'polycross' || 'backcross' )) {
209 push @error_messages, "Cell E$row_name: male parent required for biparental, sib, polycross, backcross and bulk cross types";
213 if ($cross_name){
214 $cross_name =~ s/^\s+|\s+$//g;
215 $seen_cross_names{$cross_name}++;
219 if (($cross_type eq 'bulk') || ($cross_type eq 'bulk_self') || ($cross_type eq 'bulk_open')) {
220 #$female_parent =~ s/^\s+|\s+$//g;
221 $seen_population_names{$female_parent}++;
222 if ($cross_type eq 'bulk_open') {
223 if ($male_parent) {
224 #$male_parent =~ s/^\s+|\s+$//g;
225 $seen_population_names{$male_parent}++;
227 } elsif ($cross_type eq 'bulk') {
228 $male_parent =~ s/^\s+|\s+$//g;
229 $seen_accession_names{$male_parent}++;
231 } elsif (($cross_type eq 'polycross') || ($cross_type eq 'open')) {
232 #$female_parent =~ s/^\s+|\s+$//g;
233 $seen_accession_names{$female_parent}++;
234 if ($male_parent) {
235 # $male_parent =~ s/^\s+|\s+$//g;
236 $seen_population_names{$male_parent}++;
238 } elsif ($cross_type eq 'backcross') {
239 #$female_parent =~ s/^\s+|\s+$//g;
240 $seen_backcross_parents{$female_parent}++;
241 #$male_parent =~ s/^\s+|\s+$//g;
242 $seen_backcross_parents{$male_parent}++;
243 } else {
244 #$female_parent =~ s/^\s+|\s+$//g;
245 $seen_accession_names{$female_parent}++;
247 if ($male_parent){
248 # $male_parent =~ s/^\s+|\s+$//g;
249 $seen_accession_names{$male_parent}++;
254 my @accessions = keys %seen_accession_names;
255 my $accession_validator = CXGN::List::Validate->new();
256 my @accessions_missing = @{$accession_validator->validate($schema,'uniquenames',\@accessions)->{'missing'}};
258 if (scalar(@accessions_missing) > 0) {
259 push @error_messages, "The following parents are not in the database, or are not in the database as accession uniquenames: ".join(',',@accessions_missing);
260 $errors{'missing_accessions'} = \@accessions_missing;
263 my @populations = keys %seen_population_names;
264 my $population_validator = CXGN::List::Validate->new();
265 my @populations_missing = @{$population_validator->validate($schema,'populations',\@populations)->{'missing'}};
267 if (scalar(@populations_missing) > 0) {
268 push @error_messages, "The following parents are not in the database, or are not in the database as population uniquenames: ".join(',',@populations_missing);
271 my @backcross_parents = keys %seen_backcross_parents;
272 my $backcross_parent_validator = CXGN::List::Validate->new();
273 my @backcross_parents_missing = @{$backcross_parent_validator->validate($schema,'accessions_or_crosses',\@backcross_parents)->{'missing'}};
275 if (scalar(@backcross_parents_missing) > 0) {
276 push @error_messages, "The following parents are not in the database, or are not in the database as uniquenames: ".join(',',@backcross_parents_missing);
277 $errors{'missing_accessions_or_crosses'} = \@backcross_parents_missing;
280 my @crosses = keys %seen_cross_names;
281 my $rs = $schema->resultset("Stock::Stock")->search({
282 'is_obsolete' => { '!=' => 't' },
283 'uniquename' => { -in => \@crosses }
285 while (my $r=$rs->next){
286 push @error_messages, "Cross unique id already exists in database: ".$r->uniquename;
289 #store any errors found in the parsed file to parse_errors accessor
290 if (scalar(@error_messages) >= 1) {
291 $errors{'error_messages'} = \@error_messages;
292 $self->_set_parse_errors(\%errors);
293 return;
296 return 1; #returns true if validation is passed
300 sub _parse_with_plugin {
301 my $self = shift;
302 my $filename = $self->get_filename();
303 my $schema = $self->get_chado_schema();
305 # Match a dot, extension .xls / .xlsx
306 my ($extension) = $filename =~ /(\.[^.]+)$/;
307 my $parser;
309 if ($extension eq '.xlsx') {
310 $parser = Spreadsheet::ParseXLSX->new();
312 else {
313 $parser = Spreadsheet::ParseExcel->new();
316 my $excel_obj;
317 my $worksheet;
318 my @pedigrees;
319 my %cross_additional_info;
320 my %parsed_result;
322 $excel_obj = $parser->parse($filename);
323 if ( !$excel_obj ) {
324 return;
327 $worksheet = ( $excel_obj->worksheets() )[0];
328 my ( $row_min, $row_max ) = $worksheet->row_range();
329 my ( $col_min, $col_max ) = $worksheet->col_range();
331 for my $row ( 1 .. $row_max ) {
332 my $cross_name;
333 my $cross_combination;
334 my $cross_type;
335 my $female_parent;
336 my $male_parent;
337 my $cross_stock;
339 if ($worksheet->get_cell($row,0)) {
340 $cross_name = $worksheet->get_cell($row,0)->value();
341 $cross_name =~ s/^\s+|\s+$//g;
343 if ($worksheet->get_cell($row,1)) {
344 $cross_combination = $worksheet->get_cell($row,1)->value();
345 $cross_combination =~ s/^\s+|\s+$//g;
347 if ($worksheet->get_cell($row,2)) {
348 $cross_type = $worksheet->get_cell($row,2)->value();
349 $cross_type =~ s/^\s+|\s+$//g;
351 if ($worksheet->get_cell($row,3)) {
352 $female_parent = $worksheet->get_cell($row,3)->value();
353 $female_parent =~ s/^\s+|\s+$//g;
356 if (!defined $cross_name && !defined $cross_type && !defined $female_parent) {
357 last;
360 if ($worksheet->get_cell($row,4)) {
361 $male_parent = $worksheet->get_cell($row,4)->value();
362 $male_parent =~ s/^\s+|\s+$//g;
365 for my $column ( 5 .. $col_max ) {
366 if ($worksheet->get_cell($row,$column)) {
367 my $info_header = $worksheet->get_cell(0,$column)->value();
368 $info_header =~ s/^\s+|\s+$//g;
369 $cross_additional_info{$cross_name}{$info_header} = $worksheet->get_cell($row,$column)->value();
373 my $pedigree = Bio::GeneticRelationships::Pedigree->new(name=>$cross_name, cross_type=>$cross_type, cross_combination=>$cross_combination);
374 if ($female_parent) {
375 my $female_parent_individual = Bio::GeneticRelationships::Individual->new(name => $female_parent);
376 $pedigree->set_female_parent($female_parent_individual);
378 if ($male_parent) {
379 my $male_parent_individual = Bio::GeneticRelationships::Individual->new(name => $male_parent);
380 $pedigree->set_male_parent($male_parent_individual);
383 push @pedigrees, $pedigree;
387 # print STDERR "ADDITIONAL INFO HASH =".Dumper(\%cross_additional_info)."\n";
388 $parsed_result{'additional_info'} = \%cross_additional_info;
390 $parsed_result{'crosses'} = \@pedigrees;
392 $self->_set_parsed_data(\%parsed_result);
394 return 1;
399 sub _get_accession {
400 my $self = shift;
401 my $accession_name = shift;
402 my $chado_schema = $self->get_chado_schema();
403 my $stock_lookup = CXGN::Stock::StockLookup->new(schema => $chado_schema);
404 my $stock;
405 my $accession_cvterm = SGN::Model::Cvterm->get_cvterm_row($chado_schema, 'accession', 'stock_type');
407 $stock_lookup->set_stock_name($accession_name);
408 $stock = $stock_lookup->get_stock_exact();
410 if (!$stock) {
411 return;
414 if ($stock->type_id() != $accession_cvterm->cvterm_id()) {
415 return;
418 return $stock;
423 sub _get_cross {
424 my $self = shift;
425 my $cross_name = shift;
426 my $chado_schema = $self->get_chado_schema();
427 my $stock_lookup = CXGN::Stock::StockLookup->new(schema => $chado_schema);
428 my $stock;
430 $stock_lookup->set_stock_name($cross_name);
431 $stock = $stock_lookup->get_stock_exact();
433 if (!$stock) {
434 return;
437 return $stock;