make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / load_genotypes.pl
blob9816bb38e99ca505335acc9957db2f0cd40b70e4
1 #!/usr/bin/perl
3 =head1
5 load_genotypes.pl - loading genotypes into cxgn databases, based on the load_cassava_snps.pl script by Naama.
7 =head1 SYNOPSIS
9 load_genotypes.pl -H [dbhost] -D [dbname] -i [infile] -p [project name] -y [year] g [population name] -m [protocol name] -t
11 =head1 COMMAND-LINE OPTIONS
12 ARGUMENTS
13 -H host name (required) e.g. "localhost"
14 -D database name (required) e.g. "cxgn_cassava"
15 -i path to infile (required)
16 -p project name (required) e.g. "SNP genotyping 2012 Cornell Biotech"
17 -y project year (required) e.g. "2012"
18 -g population name (required) e.g. "NaCRRI training population"
19 -m protocol name (required) e.g. "GBS ApeKI Cassava genome v6"
20 FLAGS
21 -x delete old genotypes for accessions that have new genotypes
22 -a add accessions that are not in the databaseA
23 -v order markers using version sort
24 -s sort markers according to custom sort order (see script source)
25 -t Test run . Rolling back at the end.
27 =head1 DESCRIPTION
29 This script loads genotype data into the Chado genotype table it encodes the genotype + marker name in a json format in the genotyope.uniquename field for easy parsing by a Perl program. The genotypes are linked to the relevant stock using nd_experiment_genotype. Each column in the spreadsheet, which represents a single accession (stock) is stored as a single genotype entry and linked to the stock via nd_experiment_genotype. Stock names are stored in the stock table if cannot be found, and linked to a population stock with the name supplied in opt_g
31 =head1 AUTHOR
33 Naama Menda (nm249@cornell.edu) - July 2012
34 Modified by Lukas, Jan 2015
36 =cut
38 use strict;
40 use Getopt::Std;
41 use Data::Dumper;
42 use JSON::Any;
43 use JSON::PP;
44 use Carp qw /croak/ ;
45 use Try::Tiny;
46 use Pod::Usage;
48 use Bio::Chado::Schema;
49 use CXGN::People::Person;
50 use CXGN::DB::InsertDBH;
51 use CXGN::Genotype;
52 use CXGN::GenotypeIO;
53 use Sort::Versions;
54 use SGN::Model::Cvterm;
56 our ($opt_H, $opt_D, $opt_i, $opt_t, $opt_p, $opt_y, $opt_g, $opt_a, $opt_x, $opt_v, $opt_s, $opt_m);
58 getopts('H:i:tD:p:y:g:axsm:');
60 my $dbhost = $opt_H;
61 my $dbname = $opt_D;
62 my $file = $opt_i;
63 my $population_name = $opt_g;
64 my $protocol_name = $opt_m;
66 print STDERR "Input file: $file\n";
67 print STDERR "DB host: $dbhost\n";
68 print STDERR "DB name: $dbname\n";
69 print STDERR "Population name: $population_name\n";
70 print STDERR "Project year: $opt_y\n";
71 print STDERR "Add missing accessions: $opt_a\n";
72 print STDERR "Delete old duplicate phenotypes: $opt_x\n";
73 print STDERR "Rollback: $opt_t\n";
75 if (!$opt_H || !$opt_D || !$opt_i || !$opt_g) {
76 pod2usage(-verbose => 2, -message => "Must provide options -H (hostname), -D (database name), -i (input file) , and -g (populations name for associating accessions in your SNP file) \n");
79 my $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
80 dbname=>$dbname,
81 dbargs => {AutoCommit => 1,
82 RaiseError => 1}
87 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } );
88 $dbh->do('SET search_path TO public,sgn');
90 # getting the last database ids for resetting at the end in case of rolling back
92 my $last_nd_experiment_id = $schema->resultset('NaturalDiversity::NdExperiment')->get_column('nd_experiment_id')->max;
93 my $last_cvterm_id = $schema->resultset('Cv::Cvterm')->get_column('cvterm_id')->max;
94 my $last_nd_experiment_project_id = $schema->resultset('NaturalDiversity::NdExperimentProject')->get_column('nd_experiment_project_id')->max;
95 my $last_nd_experiment_stock_id = $schema->resultset('NaturalDiversity::NdExperimentStock')->get_column('nd_experiment_stock_id')->max;
96 my $last_nd_experiment_genotype_id = $schema->resultset('NaturalDiversity::NdExperimentGenotype')->get_column('nd_experiment_genotype_id')->max;
97 my $last_genotype_id = $schema->resultset('Genetic::Genotype')->get_column('genotype_id')->max;
98 my $last_project_id = $schema->resultset('Project::Project')->get_column('project_id')->max;
100 my %seq = (
101 'nd_experiment_nd_experiment_id_seq' => $last_nd_experiment_id,
102 'cvterm_cvterm_id_seq' => $last_cvterm_id,
103 'nd_experiment_project_nd_experiment_project_id_seq' => $last_nd_experiment_project_id,
104 'nd_experiment_stock_nd_experiment_stock_id_seq' => $last_nd_experiment_stock_id,
105 'nd_experiment_genotype_nd_experiment_genotype_id_seq' => $last_nd_experiment_genotype_id,
106 'genotype_genotype_id_seq' => $last_genotype_id,
107 'project_project_id_seq' => $last_project_id,
110 my $accession_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type');
112 my $population_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'training population', 'stock_type');
115 my $igd_number_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'igd number', 'genotype_property');
117 #store a project
118 my $project = $schema->resultset("Project::Project")->find_or_create(
120 name => $opt_p,
121 description => $opt_p,
122 } ) ;
123 $project->create_projectprops( { 'project year' => $opt_y }, { autocreate => 1 } );
125 # find the cvterm for a genotyping experiment
127 my $geno_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'genotyping_experiment', 'experiment_type');
129 my $protocol_row = $schema->resultset("NaturalDiversity::NdProtocol")->find_or_create(
130 { name => $protocol_name,
131 type_id => $geno_cvterm->cvterm_id
134 my $protocol_id = $protocol_row->nd_protocol_id();
136 # find the cvterm for the SNP calling experiment
137 my $snp_genotype = SGN::Model::Cvterm->get_cvterm_row($schema, 'snp genotyping', 'genotype_property');
139 my $geolocation = $schema->resultset("NaturalDiversity::NdGeolocation")->find_or_create(
141 description => 'Cornell Biotech', #add this as an option
142 } ) ;
145 my $organism = $schema->resultset("Organism::Organism")->find_or_create(
147 genus => 'Manihot',
148 species => 'Manihot esculenta',
149 } );
151 my $population_members = SGN::Model::Cvterm->get_cvterm_row($schema, 'member_of', 'stock_relationship');
153 my $organism_id = $organism->organism_id();
154 ########################
156 #new spreadsheet,
157 #my $spreadsheet=CXGN::Tools::File::Spreadsheet->new($file);
159 print STDERR "Reading genotype information...\n";
160 my $gtio = CXGN::GenotypeIO->new( { file => $file, format => "dosage_transposed" });
162 #my @rows = $spreadsheet->row_labels();
163 #my @columns = $spreadsheet->column_labels();
165 my $coderef = sub {
166 while (my $gt = $gtio->next()) {
167 my ($accession_name, $igd_number) = split(/:/, $gt->name());
168 my $db_name = $accession_name;
170 $db_name =~ s/(.*?)\.(.*)/$1/;
172 if ($accession_name eq "marker" || $accession_name =~ /BLANK/i ) {next;}
174 #print Dumper($gt->rawscores);
176 print STDERR "Looking for accession $accession_name ($db_name)\n";
177 my %json;
178 my $cassava_stock;
179 my $stock_name;
180 my $stock_rs = $schema->resultset("Stock::Stock")->search(
181 { 'lower(me.uniquename)' => { like => lc($db_name) },
186 if ($stock_rs->count() == 0) {
188 print STDERR "No uniquename found for $db_name, checking synonyms...\n";
189 $stock_rs = $schema->resultset("Stock::Stock")->search(
191 -and => [
192 'lower(type.name)' => { like => '%synonym%' },
193 'lower(stockprops.value)' => { like => lc($db_name) },
196 { join => { 'stockprops' => 'type'} ,
197 distinct => 1
201 if ($stock_rs->count() >1 ) {
202 print STDERR "ERROR: found multiple accession synonyms found for that accession name, skipping $accession_name! \n";
203 while ( my $st = $stock_rs->next) {
204 print STDERR "stock name = " . $st->uniquename . "\n";
206 next();
209 if ($stock_rs->count() == 1) {
210 $cassava_stock = $stock_rs->first();
211 $stock_name = $cassava_stock->uniquename();
214 if ($stock_rs->count ==0) {
215 print STDERR "No synonym was found for $db_name\n";
217 #store the plant accession in the stock table if $opt_a
219 if (!$opt_a) {
220 print STDERR "WARNING! Accession $accession_name (using: $db_name) not found.\n";
221 print STDERR "Use option -a to add automatically.\n";
222 next();
224 else {
225 $cassava_stock = $schema->resultset("Stock::Stock")->create(
226 { organism_id => $organism_id,
227 name => $db_name,
228 uniquename => $db_name,
229 type_id => $accession_cvterm->cvterm_id,
230 } );
235 my $population_stock = $schema->resultset("Stock::Stock")->find_or_create(
236 { organism_id => $organism_id,
237 name => $population_name,
238 uniquename => $population_name,
239 type_id => $population_cvterm->cvterm_id,
240 } );
242 my $has_rel_rs = $schema->resultset("Stock::StockRelationship")->search(
244 type_id => $population_members->cvterm_id(),
245 subject_id => $cassava_stock->stock_id(),
246 object_id => $population_stock->stock_id(),
249 if ($has_rel_rs->count() == 0) {
250 $cassava_stock->find_or_create_related('stock_relationship_objects', {
251 type_id => $population_members->cvterm_id(),
252 subject_id => $cassava_stock->stock_id(),
253 object_id => $population_stock->stock_id(),
254 } );
256 ###############
257 print STDERR "cassava stock name = " . $cassava_stock->name . "\n";
258 my $experiment = $schema->resultset('NaturalDiversity::NdExperiment')->create(
260 nd_geolocation_id => $geolocation->nd_geolocation_id(),
261 type_id => $geno_cvterm->cvterm_id(),
262 } );
264 print STDERR "Linking to protocol...\n";
265 my $nd_experiment_protocol = $schema->resultset('NaturalDiversity::NdExperimentProtocol')->create(
267 nd_experiment_id => $experiment->nd_experiment_id(),
268 nd_protocol_id => $protocol_id,
272 #link to the project
273 $experiment->find_or_create_related('nd_experiment_projects', {
274 project_id => $project->project_id()
275 } );
276 #link the experiment to the stock
277 $experiment->find_or_create_related('nd_experiment_stocks' , {
278 stock_id => $cassava_stock->stock_id(),
279 type_id => $geno_cvterm->cvterm_id(),
281 if ($opt_x) {
282 print STDERR "OPTION -x: REMOVING OLD GENOTYPE... \n";
283 my $has_genotype_rs = $schema->resultset('NaturalDiversity::NdExperimentStock')->search_related('nd_experiment')->search_related('nd_experiment_genotypes')->search_related('genotype')->search_related('genotypeprops')->search( { 'me.stock_id' => $cassava_stock->stock_id() });
285 while (my $has_genotype = $has_genotype_rs->next()) {
286 print STDERR "Note: -x option: removing already present genotype for $db_name.\n";
287 my $genotypeprop_rs = $schema->resultset('Genetic::Genotypeprop')->search(
288 { genotype_id => $has_genotype->genotype_id() } );
289 while (my $genotypeprop = $genotypeprop_rs->next()) {
290 print STDERR "DELETING GENOTYPE PROP ".$genotypeprop->genotypeprop_id()."\n";
291 $genotypeprop->delete();
293 my $genotype = $schema->resultset('Genetic::Genotypeprop')->search(
294 { 'me.genotype_id' => $has_genotype->genotype_id(), } );
296 print STDERR "DELETING GENOTYPE: ".$has_genotype->genotype_id()."\n";
297 $genotype->delete();
299 #my $nd_experiment_genotypes = $schema->resultset('Genetic::NdExperimentGenotypes')->search( { nd_experiment_id => $has_genotype_rs->nd_experiment_id(), });
304 my @markers = @{$gtio->markers()};
306 if ($opt_s) {
307 @markers = sort bychr @{$gtio->markers()};
309 foreach my $marker_name (@markers) {
310 #print STDERR "markername: $marker_name\n";
311 #print STDERR Dumper($gt->rawscores);
312 my $base_calls = $gt->rawscores->{$marker_name}; #($marker_namefg, $accession_name);
313 #print STDERR "BASE CALL: $base_calls\n";
314 $base_calls =~ s/\s+//g;
315 if ($base_calls !~/[0-9.]+|NA/i) {
316 print STDERR "SKIPPING BASECALL $base_calls\n";
319 $json{$marker_name} = $base_calls;
322 my $json_string;
323 if ($opt_v) {
324 my $json_obj = JSON::PP->new;
325 print STDERR "Sorting and encoding markers and values... \n\n";
326 $json_string = $json_obj->sort_by(sub {versioncmp($JSON::PP::a,$JSON::PP::b)})->encode(\%json);
327 } else {
328 my $json_obj = JSON::Any->new;
329 $json_string = $json_obj->encode(\%json);
332 print STDERR "Storing new genotype for stock " . $cassava_stock->name . " \n\n";
333 my $genotype = $schema->resultset("Genetic::Genotype")->find_or_create(
335 name => $cassava_stock->name . "|" . $experiment->nd_experiment_id,
336 uniquename => $cassava_stock->name . "|" . $experiment->nd_experiment_id,
337 description => "Cassava SNP genotypes for stock " . "(name = " . $cassava_stock->name . ", id = " . $cassava_stock->stock_id . ")",
338 type_id => $snp_genotype->cvterm_id,
341 $genotype->create_genotypeprops( { 'snp genotyping' => $json_string } , {autocreate =>1 , allow_duplicate_values => 1 } );
342 $genotype->create_genotypeprops( { 'igd number' => $igd_number } , {autocreate =>1 , allow_duplicate_values => 1 } );
343 #link the genotype to the nd_experiment
344 my $nd_experiment_genotype = $experiment->find_or_create_related('nd_experiment_genotypes', { genotype_id => $genotype->genotype_id() } );
349 try {
350 $schema->txn_do($coderef);
351 if (!$opt_t) { print "Transaction succeeded! Commiting genotyping experiments! \n\n"; }
352 } catch {
353 # Transaction failed
354 foreach my $value ( keys %seq ) {
355 my $maxval= $seq{$value} || 0;
356 if ($maxval) { $dbh->do("SELECT setval ('$value', $maxval, true)") ; }
357 else { $dbh->do("SELECT setval ('$value', 1, false)"); }
359 die "An error occured! Rolling back and reseting database sequences!" . $_ . "\n";
362 sub bychr {
363 my @a = split "\t", $a;
364 my @b = split "\t", $b;
366 my $a_chr;
367 my $a_coord;
368 my $b_chr;
369 my $b_coord;
371 if ($a[1] =~ /^[A-Za-z]+(\d+)[_-](\d+)$/) {
372 $a_chr = $1;
373 $a_coord = $2;
376 if ($b[1] =~ /[A-Za-z]+(\d+)[_-](\d+)/) {
377 $b_chr = $1;
378 $b_coord = $2;
381 if ($a_chr eq $b_chr) {
382 return $a_coord <=> $b_coord;
384 else {
385 return $a_chr <=> $b_chr;