1 package CXGN
::Genotype
::StoreVCFGenotypes
;
5 CXGN::Genotype::StoreVCFGenotypes - an object to handle storing genotypes in genotypeprop from VCF file
9 Genotyping project is a top level project for saving results from related genotyping runs under.
10 Protocol is for saving all the marker info, top header lines, and reference_genome_name for the uploaded file. Many files can be uploaded under the same protocol if the data contained in the separate files is actually the same protocol. If data is separated into many files, but belongs to the same identical protocol, make sure that marker info is identical across different files, however the files can have different sample names.
11 For sample names that contain IGD numbers (with : separation) e.g. ABC:9292:9:c19238, make sure to use the igd_numbers_included flag.
12 For sample names that contain Lab numbers (with . separation) e.g. ABC.A238, make sure to use the lab_numbers_included flag.
14 protocol_info shold be a hashref with the following:
15 notice that the info in the markers and markers_array keys are identical, just in two different representations.
18 'reference_genome_name' => 'Mesculenta_511_v7',
19 'species_name' => 'Manihot esculenta',
20 'header_information_lines' => [
21 '##fileformat=VCFv4.0',
22 '##Tassel=<ID=GenotypeTable,Version=5,Description="Reference allele is not known. The major allele was used as reference allele">',
23 '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'
25 'sample_observation_unit_type_name' => 'tissue_sample',
26 'marker_names' => ['marker1', 'marker2'],
36 'info' => 'AR2=0.29;DR2=0.342;AF=0.375',
37 'format' => 'GT:AD:DP:GQ:DS:PL:NT'
47 'info' => 'AR2=0.29;DR2=0.342;AF=0.375',
48 'format' => 'GT:AD:DP:GQ:DS:PL:NT'
60 'info' => 'AR2=0.29;DR2=0.342;AF=0.375',
61 'format' => 'GT:AD:DP:GQ:DS:PL:NT'
71 'info' => 'AR2=0.29;DR2=0.342;AF=0.375',
72 'format' => 'GT:AD:DP:GQ:DS:PL:NT'
77 genotype_info shold be a hashref with the following (though the inner object keys are whatever is in the VCF format):
78 notice that the top level keys are all the sample names, and the next keys are marker names.
121 For storing genotyping data from a brand new genotyping project and protocol:
122 my $store_genotypes = CXGN::Genotype::StoreVCFGenotypes->new(
124 metadata_schema=>$metadata_schema,
125 phenome_schema=>$phenome_schema,
126 protocol_info => \%protocol_info,
127 genotype_info => \%genotype_info,
128 observation_unit_uniquenames => \@sample_uniquenames,
129 observation_unit_type_name=>'tissue_sample',
130 breeding_program_id=>101,
131 genotyping_facility=>'IGD',
132 project_year=>'2018',
133 project_location_id=>$location_id,
134 project_name=>'VCF2018',
135 project_description=>'description',
136 protocol_name=>'SNP2018',
137 protocol_description=>'protocol description',
138 organism_id=>$organism_id,
140 igd_numbers_included=>0,
141 lab_numbers_included=>0,
142 archived_filename => $archived_file,
143 archived_file_type => 'genotype_vcf' #can be 'genotype_vcf' or 'genotype_dosage'
145 my $verified_errors = $store_genotypes->validate();
146 $store_genotypes->store_metadata();
147 $store_genotypes->store_identifiers();
148 $return = $store_genotypes->store_genotypeprop_table();
150 # if genotypes are loaded consecutively, as in the transposed
151 # file, each genotype can be loaded as follows:
153 foreach $genotype (@genotypes) {
154 $store_genotypes->genotype_info($genotype);
155 $store_genotypes->observation_unit_uniquenames(\@observation_unit_uniquenames);
156 my $return = $store_genotypes->store_identifiers();
158 $return = $store_genotypes->store_genotypeprop_table();
160 ---------------------------------------------------------------
162 For storing genotyping data from a new protocol in a previously created genotyping project, use project_id:
163 my $store_genotypes = CXGN::Genotype::StoreVCFGenotypes->new(
165 metadata_schema=>$metadata_schema,
166 phenome_schema=>$phenome_schema,
167 protocol_info => \%protocol_info,
168 genotype_info => \%genotype_info,
169 observation_unit_uniquenames => \@sample_uniquenames,
170 observation_unit_type_name=>'tissue_sample',
171 project_location_id=>$location_id,
173 protocol_name=>'SNP2018',
174 protocol_description=>'protocol description',
175 organism_id=>$organism_id,
177 igd_numbers_included=>0,
178 lab_numbers_included=>0,
179 archived_filename => $archived_file,
180 archived_file_type => 'genotype_vcf' #can be 'genotype_vcf' or 'genotype_dosage'
182 my $verified_errors = $store_genotypes->validate();
183 $store_genotypes->store_metadata();
184 $store_genotypes->store_identifiers();
185 $return = $store_genotypes->store_genotypeprop_table();
187 ---------------------------------------------------------------
189 For storing genotyping data from a previously saved protocol in a new project, use protocol_id:
190 my $store_genotypes = CXGN::Genotype::StoreVCFGenotypes->new(
192 metadata_schema=>$metadata_schema,
193 phenome_schema=>$phenome_schema,
194 protocol_info => \%protocol_info,
195 genotype_info => \%genotype_info,
196 observation_unit_uniquenames => \@sample_uniquenames,
197 observation_unit_type_name=>'tissue_sample',
198 breeding_program_id=>101,
199 genotyping_facility=>'IGD',
200 project_year=>'2018',
201 project_location_id=>$location_id,
202 project_name=>'VCF2018',
203 project_description=>'description',
205 organism_id=>$organism_id,
207 igd_numbers_included=>0,
208 lab_numbers_included=>0,
209 archived_filename => $archived_file,
210 archived_file_type => 'genotype_vcf' #can be 'genotype_vcf' or 'genotype_dosage'
212 my $verified_errors = $store_genotypes->validate();
213 $store_genotypes->store_metadata();
214 $store_genotypes->store_identifiers();
215 $return = $store_genotypes->store_genotypeprop_table();
217 ---------------------------------------------------------------
219 For storing genotying data from a previously saved protocol in a previously saved project, use project_id and protocol_id:
220 my $store_genotypes = CXGN::Genotype::StoreVCFGenotypes->new(
222 metadata_schema=>$metadata_schema,
223 phenome_schema=>$phenome_schema,
224 protocol_info => \%protocol_info,
225 genotype_info => \%genotype_info,
226 observation_unit_uniquenames => \@sample_uniquenames,
227 observation_unit_type_name=>'tissue_sample',
228 project_location_id=>$location_id,
231 organism_id=>$organism_id,
233 igd_numbers_included=>0,
234 lab_numbers_included=>0,
235 archived_filename => $archived_file,
236 archived_file_type => 'genotype_vcf' #can be 'genotype_vcf' or 'genotype_dosage'
238 my $verified_errors = $store_genotypes->validate();
239 $store_genotypes->store_metadata();
240 $store_genotypes->store_identifiers();
241 $return = $store_genotypes->store_genotypeprop_table();
255 use File
::Basename qw
| basename dirname
|;
257 use CXGN
::List
::Validate
;
259 use CXGN
::UploadFile
;
260 use SGN
::Model
::Cvterm
;
264 use Hash
::Case
::Preserve
;
265 use Array
::Utils
qw(:all);
267 has
'bcs_schema' => (
268 isa
=> 'Bio::Chado::Schema',
273 has
'metadata_schema' => (
274 isa
=> 'CXGN::Metadata::Schema',
279 has
'phenome_schema' => (
280 isa
=> 'CXGN::Phenome::Schema',
285 has
'protocol_info' => (
291 has
'genotype_info' => (
297 has
'observation_unit_uniquenames' => (
303 has
'observation_unit_type_name' => ( #Can be accession, plot, plant, tissue_sample, or stocks
309 has
'breeding_program_id' => (
314 has
'genotyping_facility' => (
319 has
'project_id' => (
324 has
'project_year' => (
329 has
'project_name' => (
334 has
'project_description' => (
339 has
'project_location_id' => (
345 has
'protocol_id' => (
350 has
'protocol_name' => (
355 has
'protocol_description' => (
360 has
'organism_id' => (
366 has
'accession_population_name' => (
378 has
'archived_filename' => (
384 has
'archived_file_type' => ( #can be 'genotype_vcf' or 'genotype_dosage' to disntiguish genotyprop between old dosage only format and more info vcf format
390 has
'igd_numbers_included' => (
396 has
'lab_numbers_included' => (
402 has
'temp_file_sql_copy' => (
408 has
'geno_cvterm_id' => (
413 has
'stock_type_id' => (
418 has
'synonym_type_id' => (
423 has
'accession_type_id' => (
428 has
'stock_lookup' => (
433 has
'genotyping_facility_cvterm' => (
438 has
'snp_genotypingprop_cvterm_id' => (
443 has
'population_stock_id' => (
448 has
'population_members_id' => (
453 has
'igd_number_cvterm_id' => (
458 has
'population_cvterm_id' => (
463 has
'md_file_id' => (
468 has
'tissue_sample_type_id' => (
473 has
'vcf_genotyping_cvterm_id' => (
478 has
'vcf_genotyping_type' => (
483 has
'genotyping_type' => (
488 has
'vcf_map_details_id' => (
493 has
'vcf_map_details_markers_cvterm_id' => (
498 has
'vcf_map_details_markers_array_cvterm_id' => (
503 has
'design_cvterm' => (
508 has
'project_year_cvterm' => (
513 has
'marker_by_marker_storage' => (
518 has
'pcr_marker_genotyping_type_id' => (
523 has
'genotyping_data_type' => (
535 my $schema = $self->bcs_schema;
536 my $dbh = $schema->storage->dbh;
537 my $organism_id = $self->organism_id;
538 my $observation_unit_uniquenames = $self->observation_unit_uniquenames;
539 my $protocol_info = $self->protocol_info;
540 my $genotype_info = $self->genotype_info;
541 my $include_igd_numbers = $self->igd_numbers_included;
542 my $include_lab_numbers = $self->lab_numbers_included;
543 my $genotyping_data_type = $self->genotyping_data_type;
544 my $vcf_genotyping_type = $self->vcf_genotyping_type || 'vcf_snp_genotyping';
545 my $genotyping_type = $self->genotyping_type;
548 my @warning_messages;
550 #to disntiguish genotyprop between old dosage only format and more info vcf format
551 if ($self->archived_file_type && $self->archived_file_type ne 'genotype_vcf' && $self->archived_file_type ne 'genotype_dosage') {
552 push @error_messages, 'Archived filetype must be either genotype_vcf or genotype_dosage';
553 return {error_messages
=> \
@error_messages};
556 #check if sample names are in the database.
557 if (scalar(@
$observation_unit_uniquenames) == 0){
558 push @error_messages, "No observtaion_unit_names in file";
561 my $vcf_genotyping_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, $vcf_genotyping_type, 'genotype_property')->cvterm_id();
562 $self->vcf_genotyping_cvterm_id($vcf_genotyping_cvterm_id);
564 my $geno_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'genotyping_experiment', 'experiment_type')->cvterm_id();
565 $self->geno_cvterm_id($geno_cvterm_id);
567 my $pcr_marker_genotypeprop_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'pcr_marker_genotyping', 'genotype_property')->cvterm_id();
568 $self->pcr_marker_genotyping_type_id($pcr_marker_genotypeprop_cvterm_id);
570 my $snp_genotypingprop_cvterm_id;
571 if ($self->archived_file_type eq 'genotype_vcf'){
572 $snp_genotypingprop_cvterm_id = $vcf_genotyping_cvterm_id;
573 } elsif ($self->archived_file_type eq 'genotype_dosage'){
574 $snp_genotypingprop_cvterm_id = $vcf_genotyping_cvterm_id;
577 $self->snp_genotypingprop_cvterm_id($snp_genotypingprop_cvterm_id);
579 #remove extra numbers, such as igd after : symbol
580 my @observation_unit_uniquenames_stripped;
581 foreach (@
$observation_unit_uniquenames) {
582 print STDERR
"Now dealing with observation unit $_...\n";
583 $_ =~ s/^\s+|\s+$//g;
584 if ($include_igd_numbers){
585 my ($observation_unit_name_with_accession_name, $igd_number) = split(/:/, $_, 2);
586 $observation_unit_name_with_accession_name =~ s/^\s+|\s+$//g;
587 my ($observation_unit_name, $accession_name) = split(/\|\|\|/, $observation_unit_name_with_accession_name);
588 push @observation_unit_uniquenames_stripped, $observation_unit_name;
590 elsif ($include_lab_numbers){
591 my ($observation_unit_name_with_accession_name, $lab_number) = split(/\./, $_, 2);
592 $observation_unit_name_with_accession_name =~ s/^\s+|\s+$//g;
593 my ($observation_unit_name, $accession_name) = split(/\|\|\|/, $observation_unit_name_with_accession_name);
594 push @observation_unit_uniquenames_stripped, $observation_unit_name;
597 my ($observation_unit_name, $accession_name) = split(/\|\|\|/, $_);
598 push @observation_unit_uniquenames_stripped, $observation_unit_name;
602 my $synonym_type_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'stock_synonym', 'stock_property')->cvterm_id();
603 $self->synonym_type_id($synonym_type_id);
605 my $stock_type = $self->observation_unit_type_name;
608 my $validator = CXGN
::List
::Validate
->new();
609 if ($stock_type eq 'tissue_sample'){
610 @missing_stocks = @
{$validator->validate($schema,'tissue_samples',\
@observation_unit_uniquenames_stripped)->{'missing'}};
611 $stock_type_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, $stock_type, 'stock_type')->cvterm_id();
612 $self->stock_type_id($stock_type_id);
613 } elsif ($stock_type eq 'stocks'){
614 @missing_stocks = @
{$validator->validate($schema,'stocks',\
@observation_unit_uniquenames_stripped)->{'missing'}};
615 } elsif ($stock_type eq 'tissue_sample_or_accession'){
616 @missing_stocks = @
{$validator->validate($schema,'tissue_samples_or_accessions',\
@observation_unit_uniquenames_stripped)->{'missing'}};
617 } elsif ($stock_type eq 'accession'){
618 @missing_stocks = @
{$validator->validate($schema,'accessions',\
@observation_unit_uniquenames_stripped)->{'missing'}};
619 $stock_type_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, $stock_type, 'stock_type')->cvterm_id();
620 $self->stock_type_id($stock_type_id);
623 my $accession_type_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
624 $self->accession_type_id($accession_type_id);
625 my $q = "SELECT stock.stock_id, stock.uniquename, stockprop.value, stockprop.type_id FROM stock LEFT JOIN stockprop USING(stock_id) WHERE stock.type_id=$accession_type_id AND stock.is_obsolete = 'F';";
626 my $h = $schema->storage->dbh()->prepare($q);
628 while (my ($stock_id, $uniquename, $synonym, $type_id) = $h->fetchrow_array()) {
629 $all_names{$uniquename}++;
631 if ($type_id == $synonym_type_id) {
632 #if (exists($all_names{$synonym})){
633 # my $previous_use = $all_names{$synonym};
634 # push @error_messages, "DATABASE PROBLEM: The synonym $synonym is being used in $previous_use AND $uniquename. PLEASE RESOLVE THIS NOW OR CONTACT US!";
636 #$all_names{$synonym} = $uniquename;
641 push @error_messages, "You can only upload genotype data for a tissue_sample OR accession (including synonyms) OR stocks!"
645 foreach (@missing_stocks){
646 $unique_stocks{$_}++;
649 @missing_stocks = sort keys %unique_stocks;
650 if (scalar(@missing_stocks)>0){
651 push @error_messages, "The following stocks are not in the database: ".join(',',@missing_stocks);
654 #check if protocol_info is correct
655 if ($genotyping_data_type ne 'ssr') {
656 while (my ($chromosome, $protocol_info_chrom) = each %{$protocol_info->{markers
}}) {
657 while (my ($marker_name, $marker_info) = each %{$protocol_info_chrom}) {
658 if (!$marker_name || !$marker_info){
659 push @error_messages, "No genotype info provided";
661 foreach (keys %$marker_info){
662 if ($_ ne 'name' && $_ ne 'chrom' && $_ ne 'pos' && $_ ne 'ref' && $_ ne 'alt' && $_ ne 'qual' && $_ ne 'filter' && $_ ne 'info' && $_ ne 'format' && $_ ne 'intertek_name' && $_ ne 'sequence' && $_ ne 'facility_name'){
663 push @error_messages, "protocol_info key not recognized: $_";
666 if(!exists($marker_info->{'name'})){
667 push @error_messages, "protocol_info missing name key";
669 if(!exists($marker_info->{'chrom'})){
670 push @error_messages, "protocol_info missing chrom key";
672 if(!exists($marker_info->{'pos'})){
673 push @error_messages, "protocol_info missing pos key";
675 if(!exists($marker_info->{'ref'})){
676 push @error_messages, "protocol_info missing ref key";
678 if(!exists($marker_info->{'alt'})){
679 push @error_messages, "protocol_info missing alt key";
683 if (scalar(@
{$protocol_info->{marker_names
}}) == 0){
684 push @error_messages, "No marker info in file";
686 while (my ($chromosome, $protocol_info_chrom) = each %{$protocol_info->{markers_array
}}) {
687 if (scalar(@
{$protocol_info_chrom}) == 0){
688 push @error_messages, "No marker info in markers_array file";
693 if ($genotyping_data_type eq 'ssr') {
694 my @stock_keys = keys %{$genotype_info};
695 my @marker_keys = sort keys %{$genotype_info->{$stock_keys[0]}};
697 my $protocol_marker_names = $protocol_info->{'marker_names'};
698 my @protocol_marker_array = sort @
$protocol_marker_names;
700 my %stored_marker_hash = map {$_=>1} @protocol_marker_array;
701 my @not_stored_markers = grep {!exists $stored_marker_hash{$_}} @marker_keys;
703 if (scalar(@not_stored_markers) > 0) {
704 my $missing_markers = join(",", @not_stored_markers);
705 push @error_messages, "Error: some markers in SSR genotyping data are not in the selected protocol. Missing markers: $missing_markers";
709 #check if genotype_info is correct
710 #print STDERR Dumper($genotype_info);
712 while (my ($observation_unit_name, $marker_result) = each %$genotype_info){
713 if (!$observation_unit_name || !$marker_result){
714 push @error_messages, "No geno info in genotype_info";
718 my $genotype_cvterm_id;
720 if ($genotyping_data_type eq 'ssr') {
721 $genotype_cvterm_id = $pcr_marker_genotypeprop_cvterm_id;
723 $genotype_cvterm_id = $vcf_genotyping_cvterm_id;
726 my $previous_genotypes_search_params = {
727 'me.uniquename' => {-in => \
@observation_unit_uniquenames_stripped},
728 'nd_experiment.type_id' => $geno_cvterm_id,
729 'genotype.type_id' => $genotype_cvterm_id
732 if ($stock_type_id) {
733 $previous_genotypes_search_params->{'me.type_id'} = $stock_type_id;
736 my $previous_genotypes_exist;
737 my $previous_genotypes_rs = $schema->resultset("Stock::Stock")->search($previous_genotypes_search_params, {
738 join => {'nd_experiment_stocks' => {'nd_experiment' => [ {'nd_experiment_genotypes' => 'genotype'}, {'nd_experiment_protocols' => 'nd_protocol'}, {'nd_experiment_projects' => 'project'} ] } },
739 '+select' => ['nd_protocol.nd_protocol_id', 'nd_protocol.name', 'project.project_id', 'project.name'],
740 '+as' => ['protocol_id', 'protocol_name', 'project_id', 'project_name'],
741 order_by
=> 'genotype.genotype_id'
743 while(my $r = $previous_genotypes_rs->next){
744 print STDERR
"PREVIOUS GENOTYPES ".join (",", ($r->get_column('uniquename'), $r->get_column('protocol_name'), $r->get_column('project_name')))."\n";
745 my $uniquename = $r->uniquename;
746 my $protocol_name = $r->get_column('protocol_name');
747 my $project_name = $r->get_column('project_name');
748 push @warning_messages, "$uniquename in your file has already has genotype stored using the protocol $protocol_name in the project $project_name.";
749 $previous_genotypes_exist = 1;
753 error_messages
=> \
@error_messages,
754 warning_messages
=> \
@warning_messages,
755 missing_stocks
=> \
@missing_stocks,
756 previous_genotypes_exist
=> $previous_genotypes_exist
763 print STDERR
"Genotype VCF metadata store\n";
765 my $schema = $self->bcs_schema;
766 my $dbh = $schema->storage->dbh;
767 my $genotype_facility = $self->genotyping_facility;
768 my $opt_p = $self->project_name;
769 my $project_description = $self->project_description;
770 my $opt_y = $self->project_year;
771 my $map_protocol_name = $self->protocol_name;
772 my $map_protocol_description = $self->protocol_description;
773 my $location_id = $self->project_location_id;
774 my $igd_numbers_included = $self->igd_numbers_included;
775 my $lab_numbers_included = $self->lab_numbers_included;
776 my $stock_type = $self->observation_unit_type_name;
777 my $organism_id = $self->organism_id;
778 my $observation_unit_uniquenames = $self->observation_unit_uniquenames;
779 my $genotyping_data_type = $self->genotyping_data_type;
780 my $vcf_genotyping_type = $self->vcf_genotyping_type || 'vcf_snp_genotyping';
781 my $genotyping_type = $self->genotyping_type;
783 $dbh->do('SET search_path TO public,sgn');
785 my $population_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'population', 'stock_type')->cvterm_id();
786 $self->population_cvterm_id($population_cvterm_id);
788 my $igd_number_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'igd number', 'genotype_property')->cvterm_id();
789 $self->igd_number_cvterm_id($igd_number_cvterm_id);
791 my $vcf_genotyping_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, $vcf_genotyping_type, 'genotype_property')->cvterm_id();
793 $self->vcf_genotyping_cvterm_id($vcf_genotyping_cvterm_id);
795 my $geno_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'genotyping_experiment', 'experiment_type')->cvterm_id();
796 $self->geno_cvterm_id($geno_cvterm_id);
798 my $pcr_marker_genotyping_type_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'pcr_marker_genotyping', 'genotype_property')->cvterm_id();
799 $self->pcr_marker_genotyping_type_id($pcr_marker_genotyping_type_id);
801 my $vcf_map_details_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'vcf_map_details', 'protocol_property')->cvterm_id();
802 $self->vcf_map_details_id($vcf_map_details_id);
804 my $vcf_map_details_markers_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'vcf_map_details_markers', 'protocol_property')->cvterm_id();
805 $self->vcf_map_details_markers_cvterm_id($vcf_map_details_markers_cvterm_id);
807 my $vcf_map_details_markers_array_cvterm_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'vcf_map_details_markers_array', 'protocol_property')->cvterm_id();
808 $self->vcf_map_details_markers_array_cvterm_id($vcf_map_details_markers_array_cvterm_id);
810 my $population_members_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'member_of', 'stock_relationship')->cvterm_id();
811 $self->population_members_id($population_members_id);
813 my $design_cvterm = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'design', 'project_property');
814 $self->design_cvterm($design_cvterm);
816 my $project_year_cvterm = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'project year', 'project_property');
817 $self->project_year_cvterm($project_year_cvterm);
819 my $genotyping_facility_cvterm = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'genotyping_facility', 'project_property');
820 $self->genotyping_facility_cvterm($genotyping_facility_cvterm);
822 #when project_id is provided, a new project is not created
824 my $project_check = $schema->resultset("Project::Project")->find({
825 project_id
=> $self->project_id
828 $project_id = $project_check->project_id;
830 my $project = $schema->resultset("Project::Project")->find_or_create({
832 description
=> $project_description,
834 $project_id = $project->project_id();
835 $project->create_projectprops( { $project_year_cvterm->name() => $opt_y } );
836 $project->create_projectprops( { $self->genotyping_facility_cvterm()->name() => $genotype_facility } );
837 if ($genotyping_data_type eq 'ssr') {
838 $project->create_projectprops( { $self->design_cvterm->name() => 'pcr_genotype_data_project' } );
840 $project->create_projectprops( { $self->design_cvterm->name() => 'genotype_data_project' } );
843 my $t = CXGN
::Trial
->new({
844 bcs_schema
=> $schema,
845 trial_id
=> $project_id
847 $t->set_breeding_program($self->breeding_program_id);
848 $self->project_location_id($location_id);
851 $self->project_id($project_id);
853 #if population name given, so we can add genotyped samples to population
854 my $population_stock;
855 my $population_stock_id;
856 if ($self->accession_population_name){
857 $population_stock = $schema->resultset("Stock::Stock")->find_or_create({
858 organism_id
=> $organism_id,
859 name
=> $self->accession_population_name,
860 uniquename
=> $self->accession_population_name,
861 type_id
=> $self->population_cvterm_id(),
863 $population_stock_id = $population_stock->stock_id();
865 $self->population_stock_id($population_stock_id);
867 #When protocol_id provided, a new protocol is not created
869 my $protocol_row_check = $schema->resultset("NaturalDiversity::NdProtocol")->find({
870 nd_protocol_id
=> $self->protocol_id
872 if ($protocol_row_check){
873 $protocol_id = $protocol_row_check->nd_protocol_id;
875 my $protocol_row = $schema->resultset("NaturalDiversity::NdProtocol")->find_or_create({
876 name
=> $map_protocol_name,
877 type_id
=> $geno_cvterm_id
879 $protocol_id = $protocol_row->nd_protocol_id();
881 my $q = "UPDATE nd_protocol SET description = ? WHERE nd_protocol_id = ?;";
882 my $h = $schema->storage->dbh()->prepare($q);
883 $h->execute($map_protocol_description, $protocol_id);
885 my $new_protocol_info = $self->protocol_info;
886 my $nd_protocolprop_markers = $new_protocol_info->{markers
};
887 my $nd_protocolprop_markers_array = $new_protocol_info->{markers_array
};
889 my %unique_chromosomes;
890 while (my ($chromosome, $protocol_info_chrom) = each %{$nd_protocolprop_markers_array}) {
891 print STDERR
"getting count for chrom $chromosome\n";
892 foreach (@
$protocol_info_chrom) {
893 $unique_chromosomes{$_->{chrom
}}++;
898 foreach my $chr_name (sort keys %unique_chromosomes) {
899 my $marker_count = $unique_chromosomes{$chr_name};
900 $chromosomes{$chr_name} = {
902 marker_count
=> $marker_count
906 print STDERR Dumper \
%chromosomes;
907 $new_protocol_info->{chromosomes
} = \
%chromosomes;
909 delete($new_protocol_info->{markers
});
910 delete($new_protocol_info->{markers_array
});
912 my $nd_protocol_json_string = encode_json
$new_protocol_info;
913 my $new_protocolprop_sql = "INSERT INTO nd_protocolprop (nd_protocol_id, type_id, rank, value) VALUES (?, ?, ?, ?);";
914 my $h_protocolprop = $schema->storage->dbh()->prepare($new_protocolprop_sql);
915 $h_protocolprop->execute($protocol_id, $vcf_map_details_id, 0, $nd_protocol_json_string);
917 foreach my $chr_name (sort keys %unique_chromosomes) {
918 # print STDERR "Chromosome: $chr_name\n";
919 my $rank = $chromosomes{$chr_name}->{rank
};
920 my $nd_protocolprop_markers_json_string = encode_json
$nd_protocolprop_markers->{$chr_name};
921 my $nd_protocolprop_markers_array_json_string = encode_json
$nd_protocolprop_markers_array->{$chr_name};
922 $h_protocolprop->execute($protocol_id, $vcf_map_details_markers_cvterm_id, $rank, $nd_protocolprop_markers_json_string);
923 $h_protocolprop->execute($protocol_id, $vcf_map_details_markers_array_cvterm_id, $rank, $nd_protocolprop_markers_array_json_string);
925 print STDERR
"Protocolprop stored...\n";
928 $self->protocol_id($protocol_id);
930 my $accession_type_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
931 $self->accession_type_id($accession_type_id);
932 my $tissue_sample_type_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'tissue_sample', 'stock_type')->cvterm_id();
933 $self->tissue_sample_type_id($tissue_sample_type_id);
935 print STDERR
"Generating stock synonym lookup table...\n";
936 tie
my (%stock_lookup), 'Hash::Case::Preserve';
938 my $stock_lookup_where = '';
939 if ($self->stock_type_id) {
940 $stock_lookup_where = 'stock.type_id IN ('.$self->accession_type_id().','.$self->tissue_sample_type_id().') AND';
942 my $q = "SELECT stock.stock_id, stock.uniquename, stockprop.value, stockprop.type_id FROM stock LEFT JOIN stockprop USING(stock_id) WHERE $stock_lookup_where stock.is_obsolete = 'F';";
943 my $h = $schema->storage->dbh()->prepare($q);
946 while (my ($stock_id, $uniquename, $synonym, $type_id) = $h->fetchrow_array()) {
947 $stock_lookup{$uniquename} = { stock_id
=> $stock_id };
948 if ($type_id && $self->synonym_type_id() && ($type_id == $self->synonym_type_id())) {
949 $stock_lookup{$synonym} = { stock_id
=> $stock_id };
953 # Updates stock_lookup to have the genotypeprop_ids for samples previously saved in this protocol/project. Useful for when appending genotypes to the jsonb
954 my $q_g = "SELECT stock.stock_id, stock.uniquename, stockprop.value, stockprop.type_id, nd_experiment_genotype.genotype_id, genotypeprop.genotypeprop_id, genotypeprop.rank
956 LEFT JOIN stockprop USING(stock_id)
957 JOIN nd_experiment_stock USING(stock_id)
958 JOIN nd_experiment_genotype USING(nd_experiment_id)
959 JOIN genotypeprop ON(genotypeprop.genotype_id=nd_experiment_genotype.genotype_id AND genotypeprop.type_id=".$self->vcf_genotyping_cvterm_id.")
960 JOIN nd_experiment_protocol USING(nd_experiment_id)
961 JOIN nd_experiment_project USING(nd_experiment_id)
962 WHERE $stock_lookup_where stock.is_obsolete = 'F' AND nd_protocol_id=$protocol_id AND project_id=$project_id;";
963 my $q_g_h = $schema->storage->dbh()->prepare($q_g);
965 while (my ($stock_id, $uniquename, $synonym, $type_id, $genotype_id, $genotypeprop_id, $chromosome_counter) = $q_g_h->fetchrow_array()) {
966 $stock_lookup{$uniquename} = { stock_id
=> $stock_id, genotype_id
=> $genotype_id, chrom
=> {$chromosome_counter => $genotypeprop_id} };
967 if ($type_id && $type_id == $self->synonym_type_id()) {
968 $stock_lookup{$synonym} = { stock_id
=> $stock_id, genotype_id
=> $genotype_id, chrom
=> {$chromosome_counter => $genotypeprop_id} };
971 $self->stock_lookup(\
%stock_lookup);
972 # print STDERR "Generated lookup table with ".scalar(keys(%stock_lookup))." entries.\n";
974 print STDERR
"Generating md_file entry...\n";
975 #create relationship between nd_experiment and originating archived file
976 my $file = $self->archived_filename;
977 my $md_row = $self->metadata_schema->resultset("MdMetadata")->create({create_person_id
=> $self->user_id});
979 my $upload_file = CXGN
::UploadFile
->new();
980 my $md5 = $upload_file->get_md5($file);
981 my $md5checksum = $md5->hexdigest();
982 my $file_row = $self->metadata_schema->resultset("MdFiles")->create({
983 basename
=> basename
($file),
984 dirname
=> dirname
($file),
985 filetype
=> $self->archived_file_type,
986 md5checksum
=> $md5checksum,
987 metadata_id
=> $md_row->metadata_id(),
989 $self->md_file_id($file_row->file_id());
990 print STDERR
"md_file_id is ".$self->md_file_id()."\n";
993 sub store_identifiers
{
995 my $schema = $self->bcs_schema;
996 my $dbh = $schema->storage->dbh;
997 my $temp_file_sql_copy = $self->temp_file_sql_copy;
998 my $genotyping_data_type = $self->genotyping_data_type;
1000 my $csv = Text
::CSV
->new({ binary
=> 1, auto_diag
=> 1, eol
=> "\n"});
1001 open(my $fh, ">>", $temp_file_sql_copy) or die "Failed to open file $temp_file_sql_copy: $!";
1003 my $genotypeprop_observation_units = $self->genotype_info;
1005 my $new_genotypeprop_sql = "INSERT INTO genotypeprop (genotype_id, type_id, rank, value) VALUES (?, ?, ?, ?) RETURNING genotypeprop_id;";
1006 my $h_new_genotypeprop = $schema->storage->dbh()->prepare($new_genotypeprop_sql);
1008 #Preparing insertion of new genotypes. Will insert/update marker genotype score into genotypeprop jsonb. Only Used when loading standard VCF (non-transposed)
1009 my $update_genotypeprop_sql = "UPDATE genotypeprop SET value = (CASE
1010 WHEN value->? IS NULL
1011 THEN jsonb_insert(value, ?, ?::jsonb)
1012 WHEN value->? IS NOT NULL
1013 THEN jsonb_set(value, ?, ?::jsonb)
1014 END) WHERE genotypeprop_id = ?;";
1015 my $h_genotypeprop = $schema->storage->dbh()->prepare($update_genotypeprop_sql);
1017 my %nd_experiment_ids;
1018 my $stock_relationship_schema = $schema->resultset("Stock::StockRelationship");
1019 my $nd_experiment_schema = $schema->resultset('NaturalDiversity::NdExperiment');
1020 my $genotype_schema = $schema->resultset("Genetic::Genotype");
1021 my $genotypeprop_schema = $schema->resultset("Genetic::Genotypeprop");
1023 my $observation_unit_uniquenames = $self->observation_unit_uniquenames();
1024 foreach (@
$observation_unit_uniquenames) {
1025 $_ =~ s/^\s+|\s+$//g;
1026 # print STDERR "EACH NAME =".Dumper($_)."\n";
1027 my $observation_unit_name_with_accession_name;
1028 my $observation_unit_name;
1032 if ($self->igd_numbers_included()){
1033 ($observation_unit_name_with_accession_name, $igd_number) = split(/:/, $_, 2);
1034 $observation_unit_name_with_accession_name =~ s/^\s+|\s+$//g;
1035 ($observation_unit_name, $accession_name) = split(/\|\|\|/, $observation_unit_name_with_accession_name);
1036 } elsif ($self->lab_numbers_included()) {
1037 ($observation_unit_name_with_accession_name, $lab_number) = split(/\./, $_, 2);
1038 $observation_unit_name_with_accession_name =~ s/^\s+|\s+$//g;
1039 ($observation_unit_name, $accession_name) = split(/\|\|\|/, $observation_unit_name_with_accession_name);
1041 ($observation_unit_name, $accession_name) = split(/\|\|\|/, $_);
1043 # print STDERR "OBSERVATION UNIT NAME =".Dumper($observation_unit_name)."\n";
1044 #print STDERR "SAVING GENOTYPEPROP FOR $observation_unit_name \n";
1045 my $stock_lookup_obj = $self->stock_lookup()->{$observation_unit_name};
1046 my $stock_id = $stock_lookup_obj->{stock_id
};
1047 my $genotype_id = $stock_lookup_obj->{genotype_id
};
1048 # print STDERR "STOCK ID =".Dumper($stock_id)."\n";
1049 my $genotypeprop_json = $genotypeprop_observation_units->{$_};
1050 # print STDERR "GENOTYPEPROP JSON =".Dumper($genotypeprop_json)."\n";
1051 if ($genotypeprop_json) {
1053 if ($self->accession_population_name){
1054 my $pop_rs = $stock_relationship_schema->find_or_create({
1055 type_id
=> $self->population_members_id(),
1056 subject_id
=> $stock_id,
1057 object_id
=> $self->population_stock_id(),
1061 if ( !$self->marker_by_marker_storage || (!$genotype_id && $self->marker_by_marker_storage) ) {
1062 my $experiment = $nd_experiment_schema->create({
1063 nd_geolocation_id
=> $self->project_location_id(),
1064 type_id
=> $self->geno_cvterm_id(),
1065 nd_experiment_projects
=> [ {project_id
=> $self->project_id()} ],
1066 nd_experiment_stocks
=> [ {stock_id
=> $stock_id, type_id
=> $self->geno_cvterm_id() } ],
1067 nd_experiment_protocols
=> [ {nd_protocol_id
=> $self->protocol_id()} ]
1069 my $nd_experiment_id = $experiment->nd_experiment_id();
1070 my $genotype_description;
1071 my $genotype_type_id;
1072 if ($genotyping_data_type eq 'ssr') {
1073 $genotype_description = "SSR genotypes for stock " . "(name = " . $observation_unit_name . ", id = " . $stock_id . ")";
1074 $genotype_type_id = $self->pcr_marker_genotyping_type_id();
1076 $genotype_description = "$genotyping_data_type genotypes for stock " . "(name = " . $observation_unit_name . ", id = " . $stock_id . ")";
1077 $genotype_type_id = $self->vcf_genotyping_cvterm_id();
1080 my $genotype = $genotype_schema->create({
1081 name
=> $observation_unit_name . "|" . $nd_experiment_id,
1082 uniquename
=> $observation_unit_name . "|" . $nd_experiment_id,
1083 description
=> $genotype_description,
1084 type_id
=> $genotype_type_id,
1086 $genotype_id = $genotype->genotype_id();
1087 my $nd_experiment_genotype = $experiment->create_related('nd_experiment_genotypes', { genotype_id
=> $genotype_id } );
1089 #Store IGD number if the option is given.
1090 if ($self->igd_numbers_included()) {
1091 my $add_genotypeprop = $genotypeprop_schema->create({ genotype_id
=> $genotype_id, type_id
=> $self->igd_number_cvterm_id(), value
=> encode_json
{'igd_number' => $igd_number} });
1094 $self->stock_lookup()->{$observation_unit_name}->{genotype_id
} = $genotype_id;
1096 $nd_experiment_ids{$nd_experiment_id}++;
1099 my $chromosome_counter = 0;
1101 if ($genotyping_data_type eq 'ssr') {
1102 my $json_string = encode_json
$genotypeprop_json;
1103 $h_new_genotypeprop->execute($genotype_id, $self->pcr_marker_genotyping_type_id(), $chromosome_counter, $json_string);
1104 my $genotypeprop_id = $h_new_genotypeprop->fetchrow_array();
1105 # print STDERR "GENOTYPEPROP ID =".Dumper($genotypeprop_id)."\n";
1107 foreach my $chromosome (sort keys %$genotypeprop_json) {
1108 my $genotypeprop_id = $stock_lookup_obj->{chrom
}->{$chromosome_counter};
1110 my $chrom_genotypeprop = $genotypeprop_json->{$chromosome};
1112 if ( (!$genotypeprop_id && $self->marker_by_marker_storage) || !$self->marker_by_marker_storage ) {
1114 $chrom_genotypeprop->{CHROM
} = $chromosome;
1116 my $json_string = encode_json
$chrom_genotypeprop;
1117 if ($self->marker_by_marker_storage) { #Used when standard VCF is being stored (NOTE VCF is transpoed prior to parsing by default now), where genotype scores are appended into jsonb.
1118 $h_new_genotypeprop->execute($genotype_id, $self->snp_genotypingprop_cvterm_id(), $chromosome_counter, $json_string);
1119 my ($genotypeprop_id) = $h_new_genotypeprop->fetchrow_array();
1120 $self->stock_lookup()->{$observation_unit_name}->{chrom
}->{$chromosome_counter} = $genotypeprop_id;
1122 else { #Used when transpoed VCF is being stored, or when Intertek files being stored
1123 $csv->print($fh, [ $genotype_id, $self->snp_genotypingprop_cvterm_id(), $chromosome_counter, $json_string ]);
1126 else { #When storing standard VCF, when genotype scores are appended into jsonb one by one. NOTE VCF is transposed by default now prior to parsing, but NOT transposing is relevant for instances where transposing requires too much memory.
1127 while (my ($m, $v) = each %$chrom_genotypeprop) {
1128 my $v_string = encode_json
$v;
1129 $h_genotypeprop->execute($m, '{'.$m.'}', $v_string, $m, '{'.$m.'}', $v_string, $genotypeprop_id);
1133 $chromosome_counter++;
1140 foreach my $nd_experiment_id (keys %nd_experiment_ids) {
1141 my $experiment_files = $self->phenome_schema->resultset("NdExperimentMdFiles")->create({
1142 nd_experiment_id
=> $nd_experiment_id,
1143 file_id
=> $self->md_file_id(),
1149 nd_protocol_id
=> $self->protocol_id(),
1150 project_id
=> $self->project_id(),
1156 sub store_genotypeprop_table
{
1158 my $temp_file_sql_copy = $self->temp_file_sql_copy;
1159 my $dbh = $self->bcs_schema->storage->dbh;
1161 my $SQL = "COPY genotypeprop (genotype_id, type_id, rank, value) FROM STDIN WITH DELIMITER ',' CSV";
1162 my $sth = $dbh->do($SQL);
1164 print STDERR
"SQL COPY $temp_file_sql_copy\n";
1166 open(my $infile, "<", $temp_file_sql_copy) or die "Failed to open file in store_genotypeprop_table() $temp_file_sql_copy: $!";
1167 while (my $line = <$infile>) {
1168 $dbh->pg_putcopydata($line);
1170 $dbh->pg_putcopyend();
1175 nd_protocol_id
=> $self->protocol_id(),
1176 project_id
=> $self->project_id(),