start fixing test for multi cat phenotype upload.
[sgn.git] / lib / CXGN / Genotype / StoreVCFGenotypes.pm
blob7ac35552593cedb342cef7902e11e0a959f01279
1 package CXGN::Genotype::StoreVCFGenotypes;
3 =head1 NAME
5 CXGN::Genotype::StoreVCFGenotypes - an object to handle storing genotypes in genotypeprop from VCF file
7 =head1 USAGE
9 Genotyping project is a top level project for saving results from related genotyping runs under.
10 Protocol is for saving all the marker info, top header lines, and reference_genome_name for the uploaded file. Many files can be uploaded under the same protocol if the data contained in the separate files is actually the same protocol. If data is separated into many files, but belongs to the same identical protocol, make sure that marker info is identical across different files, however the files can have different sample names.
11 For sample names that contain IGD numbers (with : separation) e.g. ABC:9292:9:c19238, make sure to use the igd_numbers_included flag.
12 For sample names that contain Lab numbers (with . separation) e.g. ABC.A238, make sure to use the lab_numbers_included flag.
14 protocol_info shold be a hashref with the following:
15 notice that the info in the markers and markers_array keys are identical, just in two different representations.
18 'reference_genome_name' => 'Mesculenta_511_v7',
19 'species_name' => 'Manihot esculenta',
20 'header_information_lines' => [
21 '##fileformat=VCFv4.0',
22 '##Tassel=<ID=GenotypeTable,Version=5,Description="Reference allele is not known. The major allele was used as reference allele">',
23 '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'
25 'sample_observation_unit_type_name' => 'tissue_sample',
26 'marker_names' => ['marker1', 'marker2'],
27 'markers' => {
28 'marker1' => {
29 'name' => 'marker1',
30 'chrom' => '2',
31 'pos' => '20032',
32 'alt' => 'G',
33 'ref' => 'C',
34 'qual' => '99',
35 'filter' => 'PASS',
36 'info' => 'AR2=0.29;DR2=0.342;AF=0.375',
37 'format' => 'GT:AD:DP:GQ:DS:PL:NT'
39 'marker2' => {
40 'name' => 'marker2',
41 'chrom' => '2',
42 'pos' => '20033',
43 'alt' => 'G',
44 'ref' => 'C',
45 'qual' => '99',
46 'filter' => 'PASS',
47 'info' => 'AR2=0.29;DR2=0.342;AF=0.375',
48 'format' => 'GT:AD:DP:GQ:DS:PL:NT'
51 'markers_array' => [
53 'name' => 'marker1',
54 'chrom' => '2',
55 'pos' => '20032',
56 'alt' => 'G',
57 'ref' => 'C',
58 'qual' => '99',
59 'filter' => 'PASS',
60 'info' => 'AR2=0.29;DR2=0.342;AF=0.375',
61 'format' => 'GT:AD:DP:GQ:DS:PL:NT'
64 'name' => 'marker2',
65 'chrom' => '2',
66 'pos' => '20033',
67 'alt' => 'G',
68 'ref' => 'C',
69 'qual' => '99',
70 'filter' => 'PASS',
71 'info' => 'AR2=0.29;DR2=0.342;AF=0.375',
72 'format' => 'GT:AD:DP:GQ:DS:PL:NT'
77 genotype_info shold be a hashref with the following (though the inner object keys are whatever is in the VCF format):
78 notice that the top level keys are all the sample names, and the next keys are marker names.
80 'samplename1' => {
81 'marker1' => {
82 'GT' => '0/0',
83 'AD' => '9,0',
84 'DP' => '9',
85 'GQ' => '99',
86 'DS'=> '0',
87 'PL' => '0,27,255'
88 'NT' => 'G,C',
90 'marker2' => {
91 'GT' => '0/0',
92 'AD' => '9,0',
93 'DP' => '9',
94 'GQ' => '99',
95 'DS'=> '0',
96 'PL' => '0,27,255'
97 'NT' => 'G,C',
100 'samplename2' => {
101 'marker1' => {
102 'GT' => '0/0',
103 'AD' => '9,0',
104 'DP' => '9',
105 'GQ' => '99',
106 'DS'=> '0',
107 'PL' => '0,27,255'
109 'marker2' => {
110 'GT' => '0/0',
111 'AD' => '9,0',
112 'DP' => '9',
113 'GQ' => '99',
114 'DS'=> '0',
115 'PL' => '0,27,255'
116 'NT' => 'G,C',
121 For storing genotyping data from a brand new genotyping project and protocol:
122 my $store_genotypes = CXGN::Genotype::StoreVCFGenotypes->new(
123 bcs_schema=>$schema,
124 metadata_schema=>$metadata_schema,
125 phenome_schema=>$phenome_schema,
126 protocol_info => \%protocol_info,
127 genotype_info => \%genotype_info,
128 observation_unit_uniquenames => \@sample_uniquenames,
129 observation_unit_type_name=>'tissue_sample',
130 breeding_program_id=>101,
131 genotyping_facility=>'IGD',
132 project_year=>'2018',
133 project_location_id=>$location_id,
134 project_name=>'VCF2018',
135 project_description=>'description',
136 protocol_name=>'SNP2018',
137 protocol_description=>'protocol description',
138 organism_id=>$organism_id,
139 user_id => 41,
140 igd_numbers_included=>0,
141 lab_numbers_included=>0,
142 archived_filename => $archived_file,
143 archived_file_type => 'genotype_vcf' #can be 'genotype_vcf' or 'genotype_dosage'
145 my $verified_errors = $store_genotypes->validate();
146 $store_genotypes->store_metadata();
147 $store_genotypes->store_identifiers();
148 $return = $store_genotypes->store_genotypeprop_table();
150 # if genotypes are loaded consecutively, as in the transposed
151 # file, each genotype can be loaded as follows:
153 foreach $genotype (@genotypes) {
154 $store_genotypes->genotype_info($genotype);
155 $store_genotypes->observation_unit_uniquenames(\@observation_unit_uniquenames);
156 my $return = $store_genotypes->store_identifiers();
158 $return = $store_genotypes->store_genotypeprop_table();
160 ---------------------------------------------------------------
162 For storing genotyping data from a new protocol in a previously created genotyping project, use project_id:
163 my $store_genotypes = CXGN::Genotype::StoreVCFGenotypes->new(
164 bcs_schema=>$schema,
165 metadata_schema=>$metadata_schema,
166 phenome_schema=>$phenome_schema,
167 protocol_info => \%protocol_info,
168 genotype_info => \%genotype_info,
169 observation_unit_uniquenames => \@sample_uniquenames,
170 observation_unit_type_name=>'tissue_sample',
171 project_location_id=>$location_id,
172 project_id=>123,
173 protocol_name=>'SNP2018',
174 protocol_description=>'protocol description',
175 organism_id=>$organism_id,
176 user_id => 41,
177 igd_numbers_included=>0,
178 lab_numbers_included=>0,
179 archived_filename => $archived_file,
180 archived_file_type => 'genotype_vcf' #can be 'genotype_vcf' or 'genotype_dosage'
182 my $verified_errors = $store_genotypes->validate();
183 $store_genotypes->store_metadata();
184 $store_genotypes->store_identifiers();
185 $return = $store_genotypes->store_genotypeprop_table();
187 ---------------------------------------------------------------
189 For storing genotyping data from a previously saved protocol in a new project, use protocol_id:
190 my $store_genotypes = CXGN::Genotype::StoreVCFGenotypes->new(
191 bcs_schema=>$schema,
192 metadata_schema=>$metadata_schema,
193 phenome_schema=>$phenome_schema,
194 protocol_info => \%protocol_info,
195 genotype_info => \%genotype_info,
196 observation_unit_uniquenames => \@sample_uniquenames,
197 observation_unit_type_name=>'tissue_sample',
198 breeding_program_id=>101,
199 genotyping_facility=>'IGD',
200 project_year=>'2018',
201 project_location_id=>$location_id,
202 project_name=>'VCF2018',
203 project_description=>'description',
204 protocol_id => 23,
205 organism_id=>$organism_id,
206 user_id => 41,
207 igd_numbers_included=>0,
208 lab_numbers_included=>0,
209 archived_filename => $archived_file,
210 archived_file_type => 'genotype_vcf' #can be 'genotype_vcf' or 'genotype_dosage'
212 my $verified_errors = $store_genotypes->validate();
213 $store_genotypes->store_metadata();
214 $store_genotypes->store_identifiers();
215 $return = $store_genotypes->store_genotypeprop_table();
217 ---------------------------------------------------------------
219 For storing genotying data from a previously saved protocol in a previously saved project, use project_id and protocol_id:
220 my $store_genotypes = CXGN::Genotype::StoreVCFGenotypes->new(
221 bcs_schema=>$schema,
222 metadata_schema=>$metadata_schema,
223 phenome_schema=>$phenome_schema,
224 protocol_info => \%protocol_info,
225 genotype_info => \%genotype_info,
226 observation_unit_uniquenames => \@sample_uniquenames,
227 observation_unit_type_name=>'tissue_sample',
228 project_location_id=>$location_id,
229 project_id=>123,
230 protocol_id=>23,
231 organism_id=>$organism_id,
232 user_id => 41,
233 igd_numbers_included=>0,
234 lab_numbers_included=>0,
235 archived_filename => $archived_file,
236 archived_file_type => 'genotype_vcf' #can be 'genotype_vcf' or 'genotype_dosage'
238 my $verified_errors = $store_genotypes->validate();
239 $store_genotypes->store_metadata();
240 $store_genotypes->store_identifiers();
241 $return = $store_genotypes->store_genotypeprop_table();
243 =head1 DESCRIPTION
246 =head1 AUTHORS
249 =cut
251 use strict;
252 use warnings;
253 use Moose;
254 use Try::Tiny;
255 use File::Basename qw | basename dirname|;
256 use Digest::MD5;
257 use CXGN::List::Validate;
258 use Data::Dumper;
259 use CXGN::UploadFile;
260 use SGN::Model::Cvterm;
261 use JSON;
262 use CXGN::Trial;
263 use Text::CSV;
264 use Hash::Case::Preserve;
265 use Array::Utils qw(:all);
267 has 'bcs_schema' => (
268 isa => 'Bio::Chado::Schema',
269 is => 'rw',
270 required => 1,
273 has 'metadata_schema' => (
274 isa => 'CXGN::Metadata::Schema',
275 is => 'rw',
276 required => 1,
279 has 'phenome_schema' => (
280 isa => 'CXGN::Phenome::Schema',
281 is => 'rw',
282 required => 1,
285 has 'protocol_info' => (
286 isa => 'HashRef',
287 is => 'rw',
288 required => 1
291 has 'genotype_info' => (
292 isa => 'HashRef',
293 is => 'rw',
294 required => 1
297 has 'observation_unit_uniquenames' => (
298 isa => 'ArrayRef',
299 is => 'rw',
300 required => 1
303 has 'observation_unit_type_name' => ( #Can be accession, plot, plant, tissue_sample, or stocks
304 isa => 'Str',
305 is => 'rw',
306 required => 1,
309 has 'breeding_program_id' => (
310 isa => 'Int|Undef',
311 is => 'rw',
314 has 'genotyping_facility' => (
315 isa => 'Str|Undef',
316 is => 'rw',
319 has 'project_id' => (
320 isa => 'Int|Undef',
321 is => 'rw',
324 has 'project_year' => (
325 isa => 'Str|Undef',
326 is => 'rw',
329 has 'project_name' => (
330 isa => 'Str|Undef',
331 is => 'rw',
334 has 'project_description' => (
335 isa => 'Str|Undef',
336 is => 'rw',
339 has 'project_location_id' => (
340 isa => 'Int',
341 is => 'rw',
342 required => 1,
345 has 'protocol_id' => (
346 isa => 'Int|Undef',
347 is => 'rw',
350 has 'protocol_name' => (
351 isa => 'Str|Undef',
352 is => 'rw',
355 has 'protocol_description' => (
356 isa => 'Str|Undef',
357 is => 'rw',
360 has 'organism_id' => (
361 isa => 'Int',
362 is => 'rw',
363 required => 1,
366 has 'accession_population_name' => (
367 isa => 'Str',
368 is => 'rw',
369 required => 0,
372 has 'user_id' => (
373 isa => 'Int',
374 is => 'rw',
375 required => 1,
378 has 'archived_filename' => (
379 isa => 'Str',
380 is => 'rw',
381 required => 0,
384 has 'archived_file_type' => ( #can be 'genotype_vcf' or 'genotype_dosage' to disntiguish genotyprop between old dosage only format and more info vcf format
385 isa => 'Str',
386 is => 'rw',
387 required => 0,
390 has 'igd_numbers_included' => (
391 isa => 'Bool',
392 is => 'rw',
393 default => 0,
396 has 'lab_numbers_included' => (
397 isa => 'Bool',
398 is => 'rw',
399 default => 0,
402 has 'temp_file_sql_copy' => (
403 isa => 'Str',
404 is => 'rw',
405 required => 1
408 has 'geno_cvterm_id' => (
409 isa => 'Int',
410 is => 'rw',
413 has 'stock_type_id' => (
414 isa => 'Int',
415 is => 'rw',
418 has 'synonym_type_id' => (
419 isa => 'Int',
420 is => 'rw',
423 has 'accession_type_id' => (
424 isa => 'Int',
425 is => 'rw',
428 has 'stock_lookup' => (
429 isa => 'HashRef',
430 is => 'rw',
433 has 'genotyping_facility_cvterm' => (
434 isa => 'Ref',
435 is => 'rw',
438 has 'snp_genotypingprop_cvterm_id' => (
439 isa => 'Int',
440 is => 'rw',
443 has 'population_stock_id' => (
444 isa => 'Maybe[Int]',
445 is => 'rw',
448 has 'population_members_id' => (
449 isa => 'Maybe[Int]',
450 is => 'rw',
453 has 'igd_number_cvterm_id' => (
454 isa => 'Int',
455 is => 'rw',
458 has 'population_cvterm_id' => (
459 isa => 'Int',
460 is => 'rw',
463 has 'md_file_id' => (
464 isa => 'Int',
465 is => 'rw',
468 has 'tissue_sample_type_id' => (
469 isa => 'Int',
470 is => 'rw',
473 has 'vcf_genotyping_cvterm_id' => (
474 isa => 'Int|Undef',
475 is => 'rw',
478 has 'vcf_genotyping_type' => (
479 isa => 'Str|Undef',
480 is => 'rw',
483 has 'genotyping_type' => (
484 isa => 'Str|Undef',
485 is => 'rw',
488 has 'vcf_map_details_id' => (
489 isa => 'Int',
490 is => 'rw',
493 has 'vcf_map_details_markers_cvterm_id' => (
494 isa => 'Int',
495 is => 'rw',
498 has 'vcf_map_details_markers_array_cvterm_id' => (
499 isa => 'Int',
500 is => 'rw',
503 has 'design_cvterm' => (
504 isa => 'Ref',
505 is => 'rw',
508 has 'project_year_cvterm' => (
509 isa => 'Ref',
510 is => 'rw',
513 has 'marker_by_marker_storage' => (
514 isa => 'Bool|Undef',
515 is => 'rw'
518 has 'pcr_marker_genotyping_type_id' => (
519 isa => 'Int',
520 is => 'rw',
523 has 'genotyping_data_type' => (
524 isa => 'Str|Undef',
525 is => 'rw',
529 sub BUILD {
530 my $self = shift;
533 sub validate {
534 my $self = shift;
535 my $schema = $self->bcs_schema;
536 my $dbh = $schema->storage->dbh;
537 my $organism_id = $self->organism_id;
538 my $observation_unit_uniquenames = $self->observation_unit_uniquenames;
539 my $protocol_info = $self->protocol_info;
540 my $genotype_info = $self->genotype_info;
541 my $include_igd_numbers = $self->igd_numbers_included;
542 my $include_lab_numbers = $self->lab_numbers_included;
543 my $genotyping_data_type = $self->genotyping_data_type;
544 my $vcf_genotyping_type = $self->vcf_genotyping_type || 'vcf_snp_genotyping';
545 my $genotyping_type = $self->genotyping_type;
547 my @error_messages;
548 my @warning_messages;
550 #to disntiguish genotyprop between old dosage only format and more info vcf format
551 if ($self->archived_file_type && $self->archived_file_type ne 'genotype_vcf' && $self->archived_file_type ne 'genotype_dosage') {
552 push @error_messages, 'Archived filetype must be either genotype_vcf or genotype_dosage';
553 return {error_messages => \@error_messages};
556 #check if sample names are in the database.
557 if (scalar(@$observation_unit_uniquenames) == 0){
558 push @error_messages, "No observtaion_unit_names in file";
561 my $vcf_genotyping_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, $vcf_genotyping_type, 'genotype_property')->cvterm_id();
562 $self->vcf_genotyping_cvterm_id($vcf_genotyping_cvterm_id);
564 my $geno_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'genotyping_experiment', 'experiment_type')->cvterm_id();
565 $self->geno_cvterm_id($geno_cvterm_id);
567 my $pcr_marker_genotypeprop_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'pcr_marker_genotyping', 'genotype_property')->cvterm_id();
568 $self->pcr_marker_genotyping_type_id($pcr_marker_genotypeprop_cvterm_id);
570 my $snp_genotypingprop_cvterm_id;
571 if ($self->archived_file_type eq 'genotype_vcf'){
572 $snp_genotypingprop_cvterm_id = $vcf_genotyping_cvterm_id;
573 } elsif ($self->archived_file_type eq 'genotype_dosage'){
574 $snp_genotypingprop_cvterm_id = $vcf_genotyping_cvterm_id;
577 $self->snp_genotypingprop_cvterm_id($snp_genotypingprop_cvterm_id);
579 #remove extra numbers, such as igd after : symbol
580 my @observation_unit_uniquenames_stripped;
581 foreach (@$observation_unit_uniquenames) {
582 print STDERR "Now dealing with observation unit $_...\n";
583 $_ =~ s/^\s+|\s+$//g;
584 if ($include_igd_numbers){
585 my ($observation_unit_name_with_accession_name, $igd_number) = split(/:/, $_, 2);
586 $observation_unit_name_with_accession_name =~ s/^\s+|\s+$//g;
587 my ($observation_unit_name, $accession_name) = split(/\|\|\|/, $observation_unit_name_with_accession_name);
588 push @observation_unit_uniquenames_stripped, $observation_unit_name;
590 elsif ($include_lab_numbers){
591 my ($observation_unit_name_with_accession_name, $lab_number) = split(/\./, $_, 2);
592 $observation_unit_name_with_accession_name =~ s/^\s+|\s+$//g;
593 my ($observation_unit_name, $accession_name) = split(/\|\|\|/, $observation_unit_name_with_accession_name);
594 push @observation_unit_uniquenames_stripped, $observation_unit_name;
596 else {
597 my ($observation_unit_name, $accession_name) = split(/\|\|\|/, $_);
598 push @observation_unit_uniquenames_stripped, $observation_unit_name;
602 my $synonym_type_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'stock_synonym', 'stock_property')->cvterm_id();
603 $self->synonym_type_id($synonym_type_id);
605 my $stock_type = $self->observation_unit_type_name;
606 my $stock_type_id;
607 my @missing_stocks;
608 my $validator = CXGN::List::Validate->new();
609 if ($stock_type eq 'tissue_sample'){
610 @missing_stocks = @{$validator->validate($schema,'tissue_samples',\@observation_unit_uniquenames_stripped)->{'missing'}};
611 $stock_type_id = SGN::Model::Cvterm->get_cvterm_row($schema, $stock_type, 'stock_type')->cvterm_id();
612 $self->stock_type_id($stock_type_id);
613 } elsif ($stock_type eq 'stocks'){
614 @missing_stocks = @{$validator->validate($schema,'stocks',\@observation_unit_uniquenames_stripped)->{'missing'}};
615 } elsif ($stock_type eq 'tissue_sample_or_accession'){
616 @missing_stocks = @{$validator->validate($schema,'tissue_samples_or_accessions',\@observation_unit_uniquenames_stripped)->{'missing'}};
617 } elsif ($stock_type eq 'accession'){
618 @missing_stocks = @{$validator->validate($schema,'accessions',\@observation_unit_uniquenames_stripped)->{'missing'}};
619 $stock_type_id = SGN::Model::Cvterm->get_cvterm_row($schema, $stock_type, 'stock_type')->cvterm_id();
620 $self->stock_type_id($stock_type_id);
622 my %all_names;
623 my $accession_type_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
624 $self->accession_type_id($accession_type_id);
625 my $q = "SELECT stock.stock_id, stock.uniquename, stockprop.value, stockprop.type_id FROM stock LEFT JOIN stockprop USING(stock_id) WHERE stock.type_id=$accession_type_id AND stock.is_obsolete = 'F';";
626 my $h = $schema->storage->dbh()->prepare($q);
627 $h->execute();
628 while (my ($stock_id, $uniquename, $synonym, $type_id) = $h->fetchrow_array()) {
629 $all_names{$uniquename}++;
630 if ($type_id) {
631 if ($type_id == $synonym_type_id) {
632 #if (exists($all_names{$synonym})){
633 # my $previous_use = $all_names{$synonym};
634 # push @error_messages, "DATABASE PROBLEM: The synonym $synonym is being used in $previous_use AND $uniquename. PLEASE RESOLVE THIS NOW OR CONTACT US!";
636 #$all_names{$synonym} = $uniquename;
640 } else {
641 push @error_messages, "You can only upload genotype data for a tissue_sample OR accession (including synonyms) OR stocks!"
644 my %unique_stocks;
645 foreach (@missing_stocks){
646 $unique_stocks{$_}++;
649 @missing_stocks = sort keys %unique_stocks;
650 if (scalar(@missing_stocks)>0){
651 push @error_messages, "The following stocks are not in the database: ".join(',',@missing_stocks);
654 #check if protocol_info is correct
655 if ($genotyping_data_type ne 'ssr') {
656 while (my ($chromosome, $protocol_info_chrom) = each %{$protocol_info->{markers}}) {
657 while (my ($marker_name, $marker_info) = each %{$protocol_info_chrom}) {
658 if (!$marker_name || !$marker_info){
659 push @error_messages, "No genotype info provided";
661 foreach (keys %$marker_info){
662 if ($_ ne 'name' && $_ ne 'chrom' && $_ ne 'pos' && $_ ne 'ref' && $_ ne 'alt' && $_ ne 'qual' && $_ ne 'filter' && $_ ne 'info' && $_ ne 'format' && $_ ne 'intertek_name' && $_ ne 'sequence' && $_ ne 'facility_name'){
663 push @error_messages, "protocol_info key not recognized: $_";
666 if(!exists($marker_info->{'name'})){
667 push @error_messages, "protocol_info missing name key";
669 if(!exists($marker_info->{'chrom'})){
670 push @error_messages, "protocol_info missing chrom key";
672 if(!exists($marker_info->{'pos'})){
673 push @error_messages, "protocol_info missing pos key";
675 if(!exists($marker_info->{'ref'})){
676 push @error_messages, "protocol_info missing ref key";
678 if(!exists($marker_info->{'alt'})){
679 push @error_messages, "protocol_info missing alt key";
683 if (scalar(@{$protocol_info->{marker_names}}) == 0){
684 push @error_messages, "No marker info in file";
686 while (my ($chromosome, $protocol_info_chrom) = each %{$protocol_info->{markers_array}}) {
687 if (scalar(@{$protocol_info_chrom}) == 0){
688 push @error_messages, "No marker info in markers_array file";
693 if ($genotyping_data_type eq 'ssr') {
694 my @stock_keys = keys %{$genotype_info};
695 my @marker_keys = sort keys %{$genotype_info->{$stock_keys[0]}};
697 my $protocol_marker_names = $protocol_info->{'marker_names'};
698 my @protocol_marker_array = sort @$protocol_marker_names;
700 my %stored_marker_hash = map {$_=>1} @protocol_marker_array;
701 my @not_stored_markers = grep {!exists $stored_marker_hash{$_}} @marker_keys;
703 if (scalar(@not_stored_markers) > 0) {
704 my $missing_markers = join(",", @not_stored_markers);
705 push @error_messages, "Error: some markers in SSR genotyping data are not in the selected protocol. Missing markers: $missing_markers";
709 #check if genotype_info is correct
710 #print STDERR Dumper($genotype_info);
712 while (my ($observation_unit_name, $marker_result) = each %$genotype_info){
713 if (!$observation_unit_name || !$marker_result){
714 push @error_messages, "No geno info in genotype_info";
718 my $genotype_cvterm_id;
720 if ($genotyping_data_type eq 'ssr') {
721 $genotype_cvterm_id = $pcr_marker_genotypeprop_cvterm_id;
722 } else {
723 $genotype_cvterm_id = $vcf_genotyping_cvterm_id;
726 my $previous_genotypes_search_params = {
727 'me.uniquename' => {-in => \@observation_unit_uniquenames_stripped},
728 'nd_experiment.type_id' => $geno_cvterm_id,
729 'genotype.type_id' => $genotype_cvterm_id
732 if ($stock_type_id) {
733 $previous_genotypes_search_params->{'me.type_id'} = $stock_type_id;
736 my $previous_genotypes_exist;
737 my $previous_genotypes_rs = $schema->resultset("Stock::Stock")->search($previous_genotypes_search_params, {
738 join => {'nd_experiment_stocks' => {'nd_experiment' => [ {'nd_experiment_genotypes' => 'genotype'}, {'nd_experiment_protocols' => 'nd_protocol'}, {'nd_experiment_projects' => 'project'} ] } },
739 '+select' => ['nd_protocol.nd_protocol_id', 'nd_protocol.name', 'project.project_id', 'project.name'],
740 '+as' => ['protocol_id', 'protocol_name', 'project_id', 'project_name'],
741 order_by => 'genotype.genotype_id'
743 while(my $r = $previous_genotypes_rs->next){
744 print STDERR "PREVIOUS GENOTYPES ".join (",", ($r->get_column('uniquename'), $r->get_column('protocol_name'), $r->get_column('project_name')))."\n";
745 my $uniquename = $r->uniquename;
746 my $protocol_name = $r->get_column('protocol_name');
747 my $project_name = $r->get_column('project_name');
748 push @warning_messages, "$uniquename in your file has already has genotype stored using the protocol $protocol_name in the project $project_name.";
749 $previous_genotypes_exist = 1;
752 return {
753 error_messages => \@error_messages,
754 warning_messages => \@warning_messages,
755 missing_stocks => \@missing_stocks,
756 previous_genotypes_exist => $previous_genotypes_exist
761 sub store_metadata {
762 my $self = shift;
763 print STDERR "Genotype VCF metadata store\n";
765 my $schema = $self->bcs_schema;
766 my $dbh = $schema->storage->dbh;
767 my $genotype_facility = $self->genotyping_facility;
768 my $opt_p = $self->project_name;
769 my $project_description = $self->project_description;
770 my $opt_y = $self->project_year;
771 my $map_protocol_name = $self->protocol_name;
772 my $map_protocol_description = $self->protocol_description;
773 my $location_id = $self->project_location_id;
774 my $igd_numbers_included = $self->igd_numbers_included;
775 my $lab_numbers_included = $self->lab_numbers_included;
776 my $stock_type = $self->observation_unit_type_name;
777 my $organism_id = $self->organism_id;
778 my $observation_unit_uniquenames = $self->observation_unit_uniquenames;
779 my $genotyping_data_type = $self->genotyping_data_type;
780 my $vcf_genotyping_type = $self->vcf_genotyping_type || 'vcf_snp_genotyping';
781 my $genotyping_type = $self->genotyping_type;
783 $dbh->do('SET search_path TO public,sgn');
785 my $population_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'population', 'stock_type')->cvterm_id();
786 $self->population_cvterm_id($population_cvterm_id);
788 my $igd_number_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'igd number', 'genotype_property')->cvterm_id();
789 $self->igd_number_cvterm_id($igd_number_cvterm_id);
791 my $vcf_genotyping_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, $vcf_genotyping_type, 'genotype_property')->cvterm_id();
793 $self->vcf_genotyping_cvterm_id($vcf_genotyping_cvterm_id);
795 my $geno_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'genotyping_experiment', 'experiment_type')->cvterm_id();
796 $self->geno_cvterm_id($geno_cvterm_id);
798 my $pcr_marker_genotyping_type_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'pcr_marker_genotyping', 'genotype_property')->cvterm_id();
799 $self->pcr_marker_genotyping_type_id($pcr_marker_genotyping_type_id);
801 my $vcf_map_details_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'vcf_map_details', 'protocol_property')->cvterm_id();
802 $self->vcf_map_details_id($vcf_map_details_id);
804 my $vcf_map_details_markers_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'vcf_map_details_markers', 'protocol_property')->cvterm_id();
805 $self->vcf_map_details_markers_cvterm_id($vcf_map_details_markers_cvterm_id);
807 my $vcf_map_details_markers_array_cvterm_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'vcf_map_details_markers_array', 'protocol_property')->cvterm_id();
808 $self->vcf_map_details_markers_array_cvterm_id($vcf_map_details_markers_array_cvterm_id);
810 my $population_members_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'member_of', 'stock_relationship')->cvterm_id();
811 $self->population_members_id($population_members_id);
813 my $design_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'design', 'project_property');
814 $self->design_cvterm($design_cvterm);
816 my $project_year_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'project year', 'project_property');
817 $self->project_year_cvterm($project_year_cvterm);
819 my $genotyping_facility_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'genotyping_facility', 'project_property');
820 $self->genotyping_facility_cvterm($genotyping_facility_cvterm);
822 #when project_id is provided, a new project is not created
823 my $project_id;
824 my $project_check = $schema->resultset("Project::Project")->find({
825 project_id => $self->project_id
827 if ($project_check){
828 $project_id = $project_check->project_id;
829 } else {
830 my $project = $schema->resultset("Project::Project")->find_or_create({
831 name => $opt_p,
832 description => $project_description,
834 $project_id = $project->project_id();
835 $project->create_projectprops( { $project_year_cvterm->name() => $opt_y } );
836 $project->create_projectprops( { $self->genotyping_facility_cvterm()->name() => $genotype_facility } );
837 if ($genotyping_data_type eq 'ssr') {
838 $project->create_projectprops( { $self->design_cvterm->name() => 'pcr_genotype_data_project' } );
839 } else {
840 $project->create_projectprops( { $self->design_cvterm->name() => 'genotype_data_project' } );
843 my $t = CXGN::Trial->new({
844 bcs_schema => $schema,
845 trial_id => $project_id
847 $t->set_breeding_program($self->breeding_program_id);
848 $self->project_location_id($location_id);
851 $self->project_id($project_id);
853 #if population name given, so we can add genotyped samples to population
854 my $population_stock;
855 my $population_stock_id;
856 if ($self->accession_population_name){
857 $population_stock = $schema->resultset("Stock::Stock")->find_or_create({
858 organism_id => $organism_id,
859 name => $self->accession_population_name,
860 uniquename => $self->accession_population_name,
861 type_id => $self->population_cvterm_id(),
863 $population_stock_id = $population_stock->stock_id();
865 $self->population_stock_id($population_stock_id);
867 #When protocol_id provided, a new protocol is not created
868 my $protocol_id;
869 my $protocol_row_check = $schema->resultset("NaturalDiversity::NdProtocol")->find({
870 nd_protocol_id => $self->protocol_id
872 if ($protocol_row_check){
873 $protocol_id = $protocol_row_check->nd_protocol_id;
874 } else {
875 my $protocol_row = $schema->resultset("NaturalDiversity::NdProtocol")->find_or_create({
876 name => $map_protocol_name,
877 type_id => $geno_cvterm_id
879 $protocol_id = $protocol_row->nd_protocol_id();
881 my $q = "UPDATE nd_protocol SET description = ? WHERE nd_protocol_id = ?;";
882 my $h = $schema->storage->dbh()->prepare($q);
883 $h->execute($map_protocol_description, $protocol_id);
885 my $new_protocol_info = $self->protocol_info;
886 my $nd_protocolprop_markers = $new_protocol_info->{markers};
887 my $nd_protocolprop_markers_array = $new_protocol_info->{markers_array};
889 my %unique_chromosomes;
890 while (my ($chromosome, $protocol_info_chrom) = each %{$nd_protocolprop_markers_array}) {
891 print STDERR "getting count for chrom $chromosome\n";
892 foreach (@$protocol_info_chrom) {
893 $unique_chromosomes{$_->{chrom}}++;
896 my %chromosomes;
897 my $chr_count = 0;
898 foreach my $chr_name (sort keys %unique_chromosomes) {
899 my $marker_count = $unique_chromosomes{$chr_name};
900 $chromosomes{$chr_name} = {
901 rank => $chr_count,
902 marker_count => $marker_count
904 $chr_count++;
906 print STDERR Dumper \%chromosomes;
907 $new_protocol_info->{chromosomes} = \%chromosomes;
909 delete($new_protocol_info->{markers});
910 delete($new_protocol_info->{markers_array});
912 my $nd_protocol_json_string = encode_json $new_protocol_info;
913 my $new_protocolprop_sql = "INSERT INTO nd_protocolprop (nd_protocol_id, type_id, rank, value) VALUES (?, ?, ?, ?);";
914 my $h_protocolprop = $schema->storage->dbh()->prepare($new_protocolprop_sql);
915 $h_protocolprop->execute($protocol_id, $vcf_map_details_id, 0, $nd_protocol_json_string);
917 foreach my $chr_name (sort keys %unique_chromosomes) {
918 # print STDERR "Chromosome: $chr_name\n";
919 my $rank = $chromosomes{$chr_name}->{rank};
920 my $nd_protocolprop_markers_json_string = encode_json $nd_protocolprop_markers->{$chr_name};
921 my $nd_protocolprop_markers_array_json_string = encode_json $nd_protocolprop_markers_array->{$chr_name};
922 $h_protocolprop->execute($protocol_id, $vcf_map_details_markers_cvterm_id, $rank, $nd_protocolprop_markers_json_string);
923 $h_protocolprop->execute($protocol_id, $vcf_map_details_markers_array_cvterm_id, $rank, $nd_protocolprop_markers_array_json_string);
925 print STDERR "Protocolprop stored...\n";
928 $self->protocol_id($protocol_id);
930 my $accession_type_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
931 $self->accession_type_id($accession_type_id);
932 my $tissue_sample_type_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'tissue_sample', 'stock_type')->cvterm_id();
933 $self->tissue_sample_type_id($tissue_sample_type_id);
935 print STDERR "Generating stock synonym lookup table...\n";
936 tie my (%stock_lookup), 'Hash::Case::Preserve';
937 my %all_names;
938 my $stock_lookup_where = '';
939 if ($self->stock_type_id) {
940 $stock_lookup_where = 'stock.type_id IN ('.$self->accession_type_id().','.$self->tissue_sample_type_id().') AND';
942 my $q = "SELECT stock.stock_id, stock.uniquename, stockprop.value, stockprop.type_id FROM stock LEFT JOIN stockprop USING(stock_id) WHERE $stock_lookup_where stock.is_obsolete = 'F';";
943 my $h = $schema->storage->dbh()->prepare($q);
944 $h->execute();
946 while (my ($stock_id, $uniquename, $synonym, $type_id) = $h->fetchrow_array()) {
947 $stock_lookup{$uniquename} = { stock_id => $stock_id };
948 if ($type_id && $self->synonym_type_id() && ($type_id == $self->synonym_type_id())) {
949 $stock_lookup{$synonym} = { stock_id => $stock_id };
953 # Updates stock_lookup to have the genotypeprop_ids for samples previously saved in this protocol/project. Useful for when appending genotypes to the jsonb
954 my $q_g = "SELECT stock.stock_id, stock.uniquename, stockprop.value, stockprop.type_id, nd_experiment_genotype.genotype_id, genotypeprop.genotypeprop_id, genotypeprop.rank
955 FROM stock
956 LEFT JOIN stockprop USING(stock_id)
957 JOIN nd_experiment_stock USING(stock_id)
958 JOIN nd_experiment_genotype USING(nd_experiment_id)
959 JOIN genotypeprop ON(genotypeprop.genotype_id=nd_experiment_genotype.genotype_id AND genotypeprop.type_id=".$self->vcf_genotyping_cvterm_id.")
960 JOIN nd_experiment_protocol USING(nd_experiment_id)
961 JOIN nd_experiment_project USING(nd_experiment_id)
962 WHERE $stock_lookup_where stock.is_obsolete = 'F' AND nd_protocol_id=$protocol_id AND project_id=$project_id;";
963 my $q_g_h = $schema->storage->dbh()->prepare($q_g);
964 $q_g_h->execute();
965 while (my ($stock_id, $uniquename, $synonym, $type_id, $genotype_id, $genotypeprop_id, $chromosome_counter) = $q_g_h->fetchrow_array()) {
966 $stock_lookup{$uniquename} = { stock_id => $stock_id, genotype_id => $genotype_id, chrom => {$chromosome_counter => $genotypeprop_id} };
967 if ($type_id && $type_id == $self->synonym_type_id()) {
968 $stock_lookup{$synonym} = { stock_id => $stock_id, genotype_id => $genotype_id, chrom => {$chromosome_counter => $genotypeprop_id} };
971 $self->stock_lookup(\%stock_lookup);
972 # print STDERR "Generated lookup table with ".scalar(keys(%stock_lookup))." entries.\n";
974 print STDERR "Generating md_file entry...\n";
975 #create relationship between nd_experiment and originating archived file
976 my $file = $self->archived_filename;
977 my $md_row = $self->metadata_schema->resultset("MdMetadata")->create({create_person_id => $self->user_id});
978 $md_row->insert();
979 my $upload_file = CXGN::UploadFile->new();
980 my $md5 = $upload_file->get_md5($file);
981 my $md5checksum = $md5->hexdigest();
982 my $file_row = $self->metadata_schema->resultset("MdFiles")->create({
983 basename => basename($file),
984 dirname => dirname($file),
985 filetype => $self->archived_file_type,
986 md5checksum => $md5checksum,
987 metadata_id => $md_row->metadata_id(),
989 $self->md_file_id($file_row->file_id());
990 print STDERR "md_file_id is ".$self->md_file_id()."\n";
993 sub store_identifiers {
994 my $self = shift;
995 my $schema = $self->bcs_schema;
996 my $dbh = $schema->storage->dbh;
997 my $temp_file_sql_copy = $self->temp_file_sql_copy;
998 my $genotyping_data_type = $self->genotyping_data_type;
1000 my $csv = Text::CSV->new({ binary => 1, auto_diag => 1, eol => "\n"});
1001 open(my $fh, ">>", $temp_file_sql_copy) or die "Failed to open file $temp_file_sql_copy: $!";
1003 my $genotypeprop_observation_units = $self->genotype_info;
1005 my $new_genotypeprop_sql = "INSERT INTO genotypeprop (genotype_id, type_id, rank, value) VALUES (?, ?, ?, ?) RETURNING genotypeprop_id;";
1006 my $h_new_genotypeprop = $schema->storage->dbh()->prepare($new_genotypeprop_sql);
1008 #Preparing insertion of new genotypes. Will insert/update marker genotype score into genotypeprop jsonb. Only Used when loading standard VCF (non-transposed)
1009 my $update_genotypeprop_sql = "UPDATE genotypeprop SET value = (CASE
1010 WHEN value->? IS NULL
1011 THEN jsonb_insert(value, ?, ?::jsonb)
1012 WHEN value->? IS NOT NULL
1013 THEN jsonb_set(value, ?, ?::jsonb)
1014 END) WHERE genotypeprop_id = ?;";
1015 my $h_genotypeprop = $schema->storage->dbh()->prepare($update_genotypeprop_sql);
1017 my %nd_experiment_ids;
1018 my $stock_relationship_schema = $schema->resultset("Stock::StockRelationship");
1019 my $nd_experiment_schema = $schema->resultset('NaturalDiversity::NdExperiment');
1020 my $genotype_schema = $schema->resultset("Genetic::Genotype");
1021 my $genotypeprop_schema = $schema->resultset("Genetic::Genotypeprop");
1023 my $observation_unit_uniquenames = $self->observation_unit_uniquenames();
1024 foreach (@$observation_unit_uniquenames) {
1025 $_ =~ s/^\s+|\s+$//g;
1026 # print STDERR "EACH NAME =".Dumper($_)."\n";
1027 my $observation_unit_name_with_accession_name;
1028 my $observation_unit_name;
1029 my $accession_name;
1030 my $igd_number;
1031 my $lab_number;
1032 if ($self->igd_numbers_included()){
1033 ($observation_unit_name_with_accession_name, $igd_number) = split(/:/, $_, 2);
1034 $observation_unit_name_with_accession_name =~ s/^\s+|\s+$//g;
1035 ($observation_unit_name, $accession_name) = split(/\|\|\|/, $observation_unit_name_with_accession_name);
1036 } elsif ($self->lab_numbers_included()) {
1037 ($observation_unit_name_with_accession_name, $lab_number) = split(/\./, $_, 2);
1038 $observation_unit_name_with_accession_name =~ s/^\s+|\s+$//g;
1039 ($observation_unit_name, $accession_name) = split(/\|\|\|/, $observation_unit_name_with_accession_name);
1040 } else {
1041 ($observation_unit_name, $accession_name) = split(/\|\|\|/, $_);
1043 # print STDERR "OBSERVATION UNIT NAME =".Dumper($observation_unit_name)."\n";
1044 #print STDERR "SAVING GENOTYPEPROP FOR $observation_unit_name \n";
1045 my $stock_lookup_obj = $self->stock_lookup()->{$observation_unit_name};
1046 my $stock_id = $stock_lookup_obj->{stock_id};
1047 my $genotype_id = $stock_lookup_obj->{genotype_id};
1048 # print STDERR "STOCK ID =".Dumper($stock_id)."\n";
1049 my $genotypeprop_json = $genotypeprop_observation_units->{$_};
1050 # print STDERR "GENOTYPEPROP JSON =".Dumper($genotypeprop_json)."\n";
1051 if ($genotypeprop_json) {
1053 if ($self->accession_population_name){
1054 my $pop_rs = $stock_relationship_schema->find_or_create({
1055 type_id => $self->population_members_id(),
1056 subject_id => $stock_id,
1057 object_id => $self->population_stock_id(),
1061 if ( !$self->marker_by_marker_storage || (!$genotype_id && $self->marker_by_marker_storage) ) {
1062 my $experiment = $nd_experiment_schema->create({
1063 nd_geolocation_id => $self->project_location_id(),
1064 type_id => $self->geno_cvterm_id(),
1065 nd_experiment_projects => [ {project_id => $self->project_id()} ],
1066 nd_experiment_stocks => [ {stock_id => $stock_id, type_id => $self->geno_cvterm_id() } ],
1067 nd_experiment_protocols => [ {nd_protocol_id => $self->protocol_id()} ]
1069 my $nd_experiment_id = $experiment->nd_experiment_id();
1070 my $genotype_description;
1071 my $genotype_type_id;
1072 if ($genotyping_data_type eq 'ssr') {
1073 $genotype_description = "SSR genotypes for stock " . "(name = " . $observation_unit_name . ", id = " . $stock_id . ")";
1074 $genotype_type_id = $self->pcr_marker_genotyping_type_id();
1075 } else {
1076 $genotype_description = "$genotyping_data_type genotypes for stock " . "(name = " . $observation_unit_name . ", id = " . $stock_id . ")";
1077 $genotype_type_id = $self->vcf_genotyping_cvterm_id();
1080 my $genotype = $genotype_schema->create({
1081 name => $observation_unit_name . "|" . $nd_experiment_id,
1082 uniquename => $observation_unit_name . "|" . $nd_experiment_id,
1083 description => $genotype_description,
1084 type_id => $genotype_type_id,
1086 $genotype_id = $genotype->genotype_id();
1087 my $nd_experiment_genotype = $experiment->create_related('nd_experiment_genotypes', { genotype_id => $genotype_id } );
1089 #Store IGD number if the option is given.
1090 if ($self->igd_numbers_included()) {
1091 my $add_genotypeprop = $genotypeprop_schema->create({ genotype_id => $genotype_id, type_id => $self->igd_number_cvterm_id(), value => encode_json {'igd_number' => $igd_number} });
1094 $self->stock_lookup()->{$observation_unit_name}->{genotype_id} = $genotype_id;
1096 $nd_experiment_ids{$nd_experiment_id}++;
1099 my $chromosome_counter = 0;
1101 if ($genotyping_data_type eq 'ssr') {
1102 my $json_string = encode_json $genotypeprop_json;
1103 $h_new_genotypeprop->execute($genotype_id, $self->pcr_marker_genotyping_type_id(), $chromosome_counter, $json_string);
1104 my $genotypeprop_id = $h_new_genotypeprop->fetchrow_array();
1105 # print STDERR "GENOTYPEPROP ID =".Dumper($genotypeprop_id)."\n";
1106 } else {
1107 foreach my $chromosome (sort keys %$genotypeprop_json) {
1108 my $genotypeprop_id = $stock_lookup_obj->{chrom}->{$chromosome_counter};
1110 my $chrom_genotypeprop = $genotypeprop_json->{$chromosome};
1112 if ( (!$genotypeprop_id && $self->marker_by_marker_storage) || !$self->marker_by_marker_storage ) {
1114 $chrom_genotypeprop->{CHROM} = $chromosome;
1116 my $json_string = encode_json $chrom_genotypeprop;
1117 if ($self->marker_by_marker_storage) { #Used when standard VCF is being stored (NOTE VCF is transpoed prior to parsing by default now), where genotype scores are appended into jsonb.
1118 $h_new_genotypeprop->execute($genotype_id, $self->snp_genotypingprop_cvterm_id(), $chromosome_counter, $json_string);
1119 my ($genotypeprop_id) = $h_new_genotypeprop->fetchrow_array();
1120 $self->stock_lookup()->{$observation_unit_name}->{chrom}->{$chromosome_counter} = $genotypeprop_id;
1122 else { #Used when transpoed VCF is being stored, or when Intertek files being stored
1123 $csv->print($fh, [ $genotype_id, $self->snp_genotypingprop_cvterm_id(), $chromosome_counter, $json_string ]);
1126 else { #When storing standard VCF, when genotype scores are appended into jsonb one by one. NOTE VCF is transposed by default now prior to parsing, but NOT transposing is relevant for instances where transposing requires too much memory.
1127 while (my ($m, $v) = each %$chrom_genotypeprop) {
1128 my $v_string = encode_json $v;
1129 $h_genotypeprop->execute($m, '{'.$m.'}', $v_string, $m, '{'.$m.'}', $v_string, $genotypeprop_id);
1133 $chromosome_counter++;
1138 close($fh);
1140 foreach my $nd_experiment_id (keys %nd_experiment_ids) {
1141 my $experiment_files = $self->phenome_schema->resultset("NdExperimentMdFiles")->create({
1142 nd_experiment_id => $nd_experiment_id,
1143 file_id => $self->md_file_id(),
1147 my %response = (
1148 success => 1,
1149 nd_protocol_id => $self->protocol_id(),
1150 project_id => $self->project_id(),
1153 return \%response;
1156 sub store_genotypeprop_table {
1157 my $self = shift;
1158 my $temp_file_sql_copy = $self->temp_file_sql_copy;
1159 my $dbh = $self->bcs_schema->storage->dbh;
1161 my $SQL = "COPY genotypeprop (genotype_id, type_id, rank, value) FROM STDIN WITH DELIMITER ',' CSV";
1162 my $sth = $dbh->do($SQL);
1164 print STDERR "SQL COPY $temp_file_sql_copy\n";
1166 open(my $infile, "<", $temp_file_sql_copy) or die "Failed to open file in store_genotypeprop_table() $temp_file_sql_copy: $!";
1167 while (my $line = <$infile>) {
1168 $dbh->pg_putcopydata($line);
1170 $dbh->pg_putcopyend();
1171 close($infile);
1173 my %response = (
1174 success => 1,
1175 nd_protocol_id => $self->protocol_id(),
1176 project_id => $self->project_id(),
1179 return \%response;