5 load_genotypes.pl - loading genotypes into cxgn databases, based on the load_cassava_snps.pl script by Naama.
9 load_genotypes.pl -H [dbhost] -D [dbname] -i [infile] -p [project name] -y [year] g [population name] -m [protocol name] -t
11 =head1 COMMAND-LINE OPTIONS
13 -H host name (required) e.g. "localhost"
14 -D database name (required) e.g. "cxgn_cassava"
15 -i path to infile (required)
16 -p project name (required) e.g. "SNP genotyping 2012 Cornell Biotech"
17 -y project year (required) e.g. "2012"
18 -g population name (required) e.g. "NaCRRI training population"
19 -m protocol name (required) e.g. "GBS ApeKI Cassava genome v6"
21 -x delete old genotypes for accessions that have new genotypes
22 -a add accessions that are not in the databaseA
23 -v order markers using version sort
24 -s sort markers according to custom sort order (see script source)
25 -t Test run . Rolling back at the end.
29 This script loads genotype data into the Chado genotype table it encodes the genotype + marker name in a json format in the genotyope.uniquename field for easy parsing by a Perl program. The genotypes are linked to the relevant stock using nd_experiment_genotype. Each column in the spreadsheet, which represents a single accession (stock) is stored as a single genotype entry and linked to the stock via nd_experiment_genotype. Stock names are stored in the stock table if cannot be found, and linked to a population stock with the name supplied in opt_g
33 Naama Menda (nm249@cornell.edu) - July 2012
34 Modified by Lukas, Jan 2015
48 use Bio
::Chado
::Schema
;
49 use CXGN
::People
::Person
;
50 use CXGN
::DB
::InsertDBH
;
54 use SGN
::Model
::Cvterm
;
56 our ($opt_H, $opt_D, $opt_i, $opt_t, $opt_p, $opt_y, $opt_g, $opt_a, $opt_x, $opt_v, $opt_s, $opt_m);
58 getopts
('H:i:tD:p:y:g:axsm:');
63 my $population_name = $opt_g;
64 my $protocol_name = $opt_m;
66 print STDERR
"Input file: $file\n";
67 print STDERR
"DB host: $dbhost\n";
68 print STDERR
"DB name: $dbname\n";
69 print STDERR
"Population name: $population_name\n";
70 print STDERR
"Project year: $opt_y\n";
71 print STDERR
"Add missing accessions: $opt_a\n";
72 print STDERR
"Delete old duplicate phenotypes: $opt_x\n";
73 print STDERR
"Rollback: $opt_t\n";
75 if (!$opt_H || !$opt_D || !$opt_i || !$opt_g) {
76 pod2usage
(-verbose
=> 2, -message
=> "Must provide options -H (hostname), -D (database name), -i (input file) , and -g (populations name for associating accessions in your SNP file) \n");
79 my $dbh = CXGN
::DB
::InsertDBH
->new( { dbhost
=>$dbhost,
81 dbargs
=> {AutoCommit
=> 1,
87 my $schema= Bio
::Chado
::Schema
->connect( sub { $dbh->get_actual_dbh() } );
88 $dbh->do('SET search_path TO public,sgn');
90 # getting the last database ids for resetting at the end in case of rolling back
92 my $last_nd_experiment_id = $schema->resultset('NaturalDiversity::NdExperiment')->get_column('nd_experiment_id')->max;
93 my $last_cvterm_id = $schema->resultset('Cv::Cvterm')->get_column('cvterm_id')->max;
94 my $last_nd_experiment_project_id = $schema->resultset('NaturalDiversity::NdExperimentProject')->get_column('nd_experiment_project_id')->max;
95 my $last_nd_experiment_stock_id = $schema->resultset('NaturalDiversity::NdExperimentStock')->get_column('nd_experiment_stock_id')->max;
96 my $last_nd_experiment_genotype_id = $schema->resultset('NaturalDiversity::NdExperimentGenotype')->get_column('nd_experiment_genotype_id')->max;
97 my $last_genotype_id = $schema->resultset('Genetic::Genotype')->get_column('genotype_id')->max;
98 my $last_project_id = $schema->resultset('Project::Project')->get_column('project_id')->max;
101 'nd_experiment_nd_experiment_id_seq' => $last_nd_experiment_id,
102 'cvterm_cvterm_id_seq' => $last_cvterm_id,
103 'nd_experiment_project_nd_experiment_project_id_seq' => $last_nd_experiment_project_id,
104 'nd_experiment_stock_nd_experiment_stock_id_seq' => $last_nd_experiment_stock_id,
105 'nd_experiment_genotype_nd_experiment_genotype_id_seq' => $last_nd_experiment_genotype_id,
106 'genotype_genotype_id_seq' => $last_genotype_id,
107 'project_project_id_seq' => $last_project_id,
110 my $accession_cvterm = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'accession', 'stock_type');
112 my $population_cvterm = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'training population', 'stock_type');
115 my $igd_number_cvterm = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'igd number', 'genotype_property');
118 my $project = $schema->resultset("Project::Project")->find_or_create(
121 description
=> $opt_p,
123 $project->create_projectprops( { 'project year' => $opt_y }, { autocreate
=> 1 } );
125 # find the cvterm for a genotyping experiment
127 my $geno_cvterm = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'genotyping_experiment', 'experiment_type');
129 my $protocol_row = $schema->resultset("NaturalDiversity::NdProtocol")->find_or_create(
130 { name
=> $protocol_name,
131 type_id
=> $geno_cvterm->cvterm_id
134 my $protocol_id = $protocol_row->nd_protocol_id();
136 # find the cvterm for the SNP calling experiment
137 my $snp_genotype = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'snp genotyping', 'genotype_property');
139 my $geolocation = $schema->resultset("NaturalDiversity::NdGeolocation")->find_or_create(
141 description
=> 'Cornell Biotech', #add this as an option
145 my $organism = $schema->resultset("Organism::Organism")->find_or_create(
148 species
=> 'Manihot esculenta',
151 my $population_members = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'member_of', 'stock_relationship');
153 my $organism_id = $organism->organism_id();
154 ########################
157 #my $spreadsheet=CXGN::Tools::File::Spreadsheet->new($file);
159 print STDERR
"Reading genotype information...\n";
160 my $gtio = CXGN
::GenotypeIO
->new( { file
=> $file, format
=> "dosage_transposed" });
162 #my @rows = $spreadsheet->row_labels();
163 #my @columns = $spreadsheet->column_labels();
166 while (my $gt = $gtio->next()) {
167 my ($accession_name, $igd_number) = split(/:/, $gt->name());
168 my $db_name = $accession_name;
170 $db_name =~ s/(.*?)\.(.*)/$1/;
172 if ($accession_name eq "marker" || $accession_name =~ /BLANK/i ) {next;}
174 #print Dumper($gt->rawscores);
176 print STDERR
"Looking for accession $accession_name ($db_name)\n";
180 my $stock_rs = $schema->resultset("Stock::Stock")->search(
181 { 'lower(me.uniquename)' => { like
=> lc($db_name) },
186 if ($stock_rs->count() == 0) {
188 print STDERR
"No uniquename found for $db_name, checking synonyms...\n";
189 $stock_rs = $schema->resultset("Stock::Stock")->search(
192 'lower(type.name)' => { like
=> '%synonym%' },
193 'lower(stockprops.value)' => { like
=> lc($db_name) },
196 { join => { 'stockprops' => 'type'} ,
201 if ($stock_rs->count() >1 ) {
202 print STDERR
"ERROR: found multiple accession synonyms found for that accession name, skipping $accession_name! \n";
203 while ( my $st = $stock_rs->next) {
204 print STDERR
"stock name = " . $st->uniquename . "\n";
209 if ($stock_rs->count() == 1) {
210 $cassava_stock = $stock_rs->first();
211 $stock_name = $cassava_stock->uniquename();
214 if ($stock_rs->count ==0) {
215 print STDERR
"No synonym was found for $db_name\n";
217 #store the plant accession in the stock table if $opt_a
220 print STDERR
"WARNING! Accession $accession_name (using: $db_name) not found.\n";
221 print STDERR
"Use option -a to add automatically.\n";
225 $cassava_stock = $schema->resultset("Stock::Stock")->create(
226 { organism_id
=> $organism_id,
228 uniquename
=> $db_name,
229 type_id
=> $accession_cvterm->cvterm_id,
235 my $population_stock = $schema->resultset("Stock::Stock")->find_or_create(
236 { organism_id
=> $organism_id,
237 name
=> $population_name,
238 uniquename
=> $population_name,
239 type_id
=> $population_cvterm->cvterm_id,
242 my $has_rel_rs = $schema->resultset("Stock::StockRelationship")->search(
244 type_id
=> $population_members->cvterm_id(),
245 subject_id
=> $cassava_stock->stock_id(),
246 object_id
=> $population_stock->stock_id(),
249 if ($has_rel_rs->count() == 0) {
250 $cassava_stock->find_or_create_related('stock_relationship_objects', {
251 type_id
=> $population_members->cvterm_id(),
252 subject_id
=> $cassava_stock->stock_id(),
253 object_id
=> $population_stock->stock_id(),
257 print STDERR
"cassava stock name = " . $cassava_stock->name . "\n";
258 my $experiment = $schema->resultset('NaturalDiversity::NdExperiment')->create(
260 nd_geolocation_id
=> $geolocation->nd_geolocation_id(),
261 type_id
=> $geno_cvterm->cvterm_id(),
264 print STDERR
"Linking to protocol...\n";
265 my $nd_experiment_protocol = $schema->resultset('NaturalDiversity::NdExperimentProtocol')->create(
267 nd_experiment_id
=> $experiment->nd_experiment_id(),
268 nd_protocol_id
=> $protocol_id,
273 $experiment->find_or_create_related('nd_experiment_projects', {
274 project_id
=> $project->project_id()
276 #link the experiment to the stock
277 $experiment->find_or_create_related('nd_experiment_stocks' , {
278 stock_id
=> $cassava_stock->stock_id(),
279 type_id
=> $geno_cvterm->cvterm_id(),
282 print STDERR
"OPTION -x: REMOVING OLD GENOTYPE... \n";
283 my $has_genotype_rs = $schema->resultset('NaturalDiversity::NdExperimentStock')->search_related('nd_experiment')->search_related('nd_experiment_genotypes')->search_related('genotype')->search_related('genotypeprops')->search( { 'me.stock_id' => $cassava_stock->stock_id() });
285 while (my $has_genotype = $has_genotype_rs->next()) {
286 print STDERR
"Note: -x option: removing already present genotype for $db_name.\n";
287 my $genotypeprop_rs = $schema->resultset('Genetic::Genotypeprop')->search(
288 { genotype_id
=> $has_genotype->genotype_id() } );
289 while (my $genotypeprop = $genotypeprop_rs->next()) {
290 print STDERR
"DELETING GENOTYPE PROP ".$genotypeprop->genotypeprop_id()."\n";
291 $genotypeprop->delete();
293 my $genotype = $schema->resultset('Genetic::Genotypeprop')->search(
294 { 'me.genotype_id' => $has_genotype->genotype_id(), } );
296 print STDERR
"DELETING GENOTYPE: ".$has_genotype->genotype_id()."\n";
299 #my $nd_experiment_genotypes = $schema->resultset('Genetic::NdExperimentGenotypes')->search( { nd_experiment_id => $has_genotype_rs->nd_experiment_id(), });
304 my @markers = @
{$gtio->markers()};
307 @markers = sort bychr @
{$gtio->markers()};
309 foreach my $marker_name (@markers) {
310 #print STDERR "markername: $marker_name\n";
311 #print STDERR Dumper($gt->rawscores);
312 my $base_calls = $gt->rawscores->{$marker_name}; #($marker_namefg, $accession_name);
313 #print STDERR "BASE CALL: $base_calls\n";
314 $base_calls =~ s/\s+//g;
315 if ($base_calls !~/[0-9.]+|NA/i) {
316 print STDERR
"SKIPPING BASECALL $base_calls\n";
319 $json{$marker_name} = $base_calls;
324 my $json_obj = JSON
::PP
->new;
325 print STDERR
"Sorting and encoding markers and values... \n\n";
326 $json_string = $json_obj->sort_by(sub {versioncmp
($JSON::PP
::a
,$JSON::PP
::b
)})->encode(\
%json);
328 my $json_obj = JSON
::Any
->new;
329 $json_string = $json_obj->encode(\
%json);
332 print STDERR
"Storing new genotype for stock " . $cassava_stock->name . " \n\n";
333 my $genotype = $schema->resultset("Genetic::Genotype")->find_or_create(
335 name
=> $cassava_stock->name . "|" . $experiment->nd_experiment_id,
336 uniquename
=> $cassava_stock->name . "|" . $experiment->nd_experiment_id,
337 description
=> "Cassava SNP genotypes for stock " . "(name = " . $cassava_stock->name . ", id = " . $cassava_stock->stock_id . ")",
338 type_id
=> $snp_genotype->cvterm_id,
341 $genotype->create_genotypeprops( { 'snp genotyping' => $json_string } , {autocreate
=>1 , allow_duplicate_values
=> 1 } );
342 $genotype->create_genotypeprops( { 'igd number' => $igd_number } , {autocreate
=>1 , allow_duplicate_values
=> 1 } );
343 #link the genotype to the nd_experiment
344 my $nd_experiment_genotype = $experiment->find_or_create_related('nd_experiment_genotypes', { genotype_id
=> $genotype->genotype_id() } );
350 $schema->txn_do($coderef);
351 if (!$opt_t) { print "Transaction succeeded! Commiting genotyping experiments! \n\n"; }
354 foreach my $value ( keys %seq ) {
355 my $maxval= $seq{$value} || 0;
356 if ($maxval) { $dbh->do("SELECT setval ('$value', $maxval, true)") ; }
357 else { $dbh->do("SELECT setval ('$value', 1, false)"); }
359 die "An error occured! Rolling back and reseting database sequences!" . $_ . "\n";
363 my @a = split "\t", $a;
364 my @b = split "\t", $b;
371 if ($a[1] =~ /^[A-Za-z]+(\d+)[_-](\d+)$/) {
376 if ($b[1] =~ /[A-Za-z]+(\d+)[_-](\d+)/) {
381 if ($a_chr eq $b_chr) {
382 return $a_coord <=> $b_coord;
385 return $a_chr <=> $b_chr;